Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
151 changes: 111 additions & 40 deletions pkg/controllers/cloud_config_sync_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,19 @@ import (
"context"
"fmt"
"reflect"
"sync"
"time"

corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/errors"
apierrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/klog/v2"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/builder"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/handler"
"sigs.k8s.io/controller-runtime/pkg/predicate"
"sigs.k8s.io/controller-runtime/pkg/reconcile"

configv1 "github.com/openshift/api/config/v1"
"github.com/openshift/api/features"
Expand All @@ -30,6 +33,11 @@ const (
// Controller conditions for the Cluster Operator resource
cloudConfigControllerAvailableCondition = "CloudConfigControllerAvailable"
cloudConfigControllerDegradedCondition = "CloudConfigControllerDegraded"

// transientDegradedThreshold is how long transient errors must persist before
// the controller sets Degraded=True. This prevents brief
// API server blips during upgrades from immediately degrading the operator.
transientDegradedThreshold = 2 * time.Minute
)

// shouldManageManagedConfigMap returns true if CCCMO should manage the
Expand Down Expand Up @@ -78,34 +86,99 @@ type CloudConfigReconciler struct {
ClusterOperatorStatusClient
Scheme *runtime.Scheme
FeatureGateAccess featuregates.FeatureGateAccess
failures failureWindow
}

// failureWindow tracks consecutive transient failures. All methods are safe for concurrent use.
type failureWindow struct {
mu sync.Mutex
consecutiveFailureSince *time.Time
lastTransientFailureAt *time.Time
}

// clear resets the failure window. Call this on every successful reconcile.
func (fw *failureWindow) clear() {
fw.mu.Lock()
defer fw.mu.Unlock()
fw.consecutiveFailureSince = nil
fw.lastTransientFailureAt = nil
}

// observe records a transient failure at now and returns the elapsed time since
// the window started plus a boolean indicating whether the window was just opened
// or restarted. staleAfter controls stale-window detection: if the gap since the
// last observed failure exceeds staleAfter, the window restarts. Pass 0 to disable.
func (fw *failureWindow) observe(now time.Time, staleAfter time.Duration) (elapsed time.Duration, started bool) {
fw.mu.Lock()
defer fw.mu.Unlock()
stale := staleAfter > 0 && fw.lastTransientFailureAt != nil && now.Sub(*fw.lastTransientFailureAt) > staleAfter
if fw.consecutiveFailureSince == nil || stale {
fw.consecutiveFailureSince = &now
fw.lastTransientFailureAt = &now
return 0, true
}
fw.lastTransientFailureAt = &now
return now.Sub(*fw.consecutiveFailureSince), false
}

// handleTransient records a transient failure and degrades only after threshold has elapsed.
// name labels log messages. staleAfter controls stale-window restart (pass 0 to disable).
// setDegraded is invoked only when the threshold is exceeded.
// Always returns a non-nil error so controller-runtime requeues with exponential backoff.
func (fw *failureWindow) handleTransient(now time.Time, staleAfter, threshold time.Duration, name string, err error, setDegraded func() error) (ctrl.Result, error) {
elapsed, started := fw.observe(now, staleAfter)
if started {
klog.V(4).Infof("%s: transient failure started (%v), will degrade after %s", name, err, threshold)
return ctrl.Result{}, err
}
if elapsed < threshold {
klog.V(4).Infof("%s: transient failure ongoing for %s (threshold %s): %v", name, elapsed, threshold, err)
return ctrl.Result{}, err
}
klog.Warningf("%s: transient failure exceeded threshold (%s), setting degraded: %v", name, elapsed, err)
if setErr := setDegraded(); setErr != nil {
return ctrl.Result{}, fmt.Errorf("failed to set degraded condition: %w", setErr)
}
return ctrl.Result{}, err
}

// handleTerminal degrades immediately and returns nil so controller-runtime does not requeue.
// An existing watch on the relevant resource will re-trigger reconciliation when fixed.
// name labels log messages.
func (fw *failureWindow) handleTerminal(name string, err error, setDegraded func() error) (ctrl.Result, error) {
klog.Errorf("%s: terminal error, setting degraded: %v", name, err)
if setErr := setDegraded(); setErr != nil {
return ctrl.Result{}, fmt.Errorf("failed to set degraded condition: %w", setErr)
}
return ctrl.Result{}, nil
Comment on lines +145 to +153
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major | ⚡ Quick win

Reset the transient window when a terminal error wins.

handleTerminal leaves consecutiveFailureSince intact. If transient failures accumulate, then a terminal misconfiguration happens, the first post-fix transient error inherits the old window and can degrade immediately even though it should start a fresh transient sequence.

Suggested fix
 func (fw *failureWindow) handleTerminal(name string, err error, setDegraded func() error) (ctrl.Result, error) {
+	fw.clear()
 	klog.Errorf("%s: terminal error, setting degraded: %v", name, err)
 	if setErr := setDegraded(); setErr != nil {
 		return ctrl.Result{}, fmt.Errorf("failed to set degraded condition: %w", setErr)
 	}
 	return ctrl.Result{}, nil
 }
🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@pkg/controllers/cloud_config_sync_controller.go` around lines 145 - 153,
handleTerminal currently leaves fw.consecutiveFailureSince intact so a later
transient failure inherits the old window; modify fw.handleTerminal (method on
type failureWindow) to clear/reset the consecutiveFailureSince field when a
terminal error is handled (before returning) so future transient failures start
a fresh window, then proceed to call setDegraded() and return as before.

}

func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) (result ctrl.Result, retErr error) {
klog.V(1).Infof("Syncing cloud-conf ConfigMap")

defer finalizeReconcile(&r.failures, r.Clock, 0, transientDegradedThreshold, "CloudConfigReconciler", r.failures.clear, degradedSetter(ctx, r.setDegradedCondition), &result, &retErr)

infra := &configv1.Infrastructure{}
if err := r.Get(ctx, client.ObjectKey{Name: infrastructureResourceName}, infra); err != nil {
klog.Errorf("infrastructure resource not found")
if err := r.setDegradedCondition(ctx); err != nil {
if err := r.Get(ctx, client.ObjectKey{Name: infrastructureResourceName}, infra); apierrors.IsNotFound(err) {
// No cloud platform: mirror the main controller's behaviour of returning Available.
klog.Infof("Infrastructure cluster does not exist. Skipping...")
if err := r.setAvailableCondition(ctx); err != nil {
return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err)
}
return ctrl.Result{}, err
return ctrl.Result{}, nil
} else if err != nil {
return ctrl.Result{}, fmt.Errorf("failed to get Infrastructure: %w", err)
}

network := &configv1.Network{}
if err := r.Get(ctx, client.ObjectKey{Name: "cluster"}, network); err != nil {
if err := r.setDegradedCondition(ctx); err != nil {
return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller when getting cluster Network object: %v", err)
}
return ctrl.Result{}, err
return ctrl.Result{}, fmt.Errorf("failed to get cluster Network: %w", err)
}

syncNeeded, err := r.isCloudConfigSyncNeeded(infra.Status.PlatformStatus, infra.Spec.CloudConfig)
if err != nil {
if err := r.setDegradedCondition(ctx); err != nil {
return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err)
}
return ctrl.Result{}, err
// nil platformStatus is a terminal misconfiguration.
return ctrl.Result{}, reconcile.TerminalError(fmt.Errorf("failed to check cloud config sync requirements: %w", err))
}
if !syncNeeded {
if err := r.setAvailableCondition(ctx); err != nil {
Expand Down Expand Up @@ -135,11 +208,9 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request)

cloudConfigTransformerFn, needsManagedConfigLookup, err := cloud.GetCloudConfigTransformer(infra.Status.PlatformStatus)
if err != nil {
// Unsupported platform won't change without a cluster reconfigure.
klog.Errorf("unable to get cloud config transformer function; unsupported platform")
if err := r.setDegradedCondition(ctx); err != nil {
return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err)
}
return ctrl.Result{}, err
return ctrl.Result{}, reconcile.TerminalError(fmt.Errorf("failed to get cloud config transformer: %w", err))
}

platformType := infra.Status.PlatformStatus.Type
Expand All @@ -161,14 +232,10 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request)
}
if err := r.Get(ctx, defaultSourceCMObjectKey, sourceCM); err == nil {
managedConfigFound = true
} else if errors.IsNotFound(err) {
} else if apierrors.IsNotFound(err) {
klog.Warningf("managed cloud-config is not found, falling back to infrastructure config")
} else if err != nil {
klog.Errorf("unable to get managed cloud-config for sync")
if err := r.setDegradedCondition(ctx); err != nil {
return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err)
}
return ctrl.Result{}, err
} else {
return ctrl.Result{}, fmt.Errorf("failed to get managed cloud config: %w", err)
}
}

Expand All @@ -179,7 +246,7 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request)
Name: infra.Spec.CloudConfig.Name,
Namespace: OpenshiftConfigNamespace,
}
if err := r.Get(ctx, openshiftUnmanagedCMKey, sourceCM); errors.IsNotFound(err) {
if err := r.Get(ctx, openshiftUnmanagedCMKey, sourceCM); apierrors.IsNotFound(err) {
klog.Warningf("cloud-config not found in either openshift-config-managed or openshift-config namespace")
// For platforms we manage, create a minimal valid config that will be populated by the transformer
if shouldManageManagedConfigMap(platformType, features) {
Expand All @@ -200,23 +267,30 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request)

sourceCM, err = r.prepareSourceConfigMap(sourceCM, infra)
if err != nil {
if err := r.setDegradedCondition(ctx); err != nil {
return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err)
}
return ctrl.Result{}, err
// User-supplied key mismatch: terminal until the ConfigMap or Infrastructure changes.
return ctrl.Result{}, reconcile.TerminalError(fmt.Errorf("failed to prepare source cloud config: %w", err))
}

// Apply transformer if needed
if r.FeatureGateAccess == nil {
// Operator misconfiguration at startup: Terminal.
return ctrl.Result{}, reconcile.TerminalError(fmt.Errorf("FeatureGateAccess is not configured"))
}

features, err = r.FeatureGateAccess.CurrentFeatureGates()
if err != nil {
// The feature-gate informer may not have synced yet: transient.
klog.Errorf("unable to get feature gates: %v", err)
return ctrl.Result{}, fmt.Errorf("failed to get feature gates: %w", err)
}
if cloudConfigTransformerFn != nil {
// We ignore stuff in sourceCM.BinaryData. This isn't allowed to
// contain any key that overlaps with those found in sourceCM.Data and
// we're not expecting users to put their data in the former.
output, err := cloudConfigTransformerFn(sourceCM.Data[defaultConfigKey], infra, network, features)
if err != nil {
if err := r.setDegradedCondition(ctx); err != nil {
return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err)
}
return ctrl.Result{}, err
// Platform-specific transform failed on the current config data: terminal.
return ctrl.Result{}, reconcile.TerminalError(fmt.Errorf("failed to transform cloud config: %w", err))
}
sourceCM.Data[defaultConfigKey] = output
}
Expand All @@ -229,17 +303,14 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request)
if err := r.setDegradedCondition(ctx); err != nil {
return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err)
}
return ctrl.Result{}, err
return ctrl.Result{}, err // transient
}
}

// Sync the transformed config to the target configmap for CCM consumption
if err := r.syncCloudConfigData(ctx, sourceCM); err != nil {
klog.Errorf("unable to sync cloud config")
if err := r.setDegradedCondition(ctx); err != nil {
return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err)
}
return ctrl.Result{}, err
return ctrl.Result{}, fmt.Errorf("failed to sync cloud config: %w", err)
}

if err := r.setAvailableCondition(ctx); err != nil {
Expand Down Expand Up @@ -335,7 +406,7 @@ func (r *CloudConfigReconciler) syncConfigMapToTarget(ctx context.Context, sourc

// Check if target exists
err := r.Get(ctx, targetKey, targetCM)
if err != nil && !errors.IsNotFound(err) {
if err != nil && !apierrors.IsNotFound(err) {
return fmt.Errorf("failed to get target configmap %s/%s: %w", targetNamespace, targetName, err)
}

Expand Down
82 changes: 66 additions & 16 deletions pkg/controllers/cloud_config_sync_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@ var _ = Describe("isCloudConfigEqual reconciler method", func() {
}

It("should return 'true' if ConfigMaps content are equal", func() {
Expect(reconciler.isCloudConfigEqual(makeManagedCloudConfig(configv1.AzurePlatformType), makeManagedCloudConfig(configv1.AzurePlatformType))).Should(BeTrue())
Expect(reconciler.isCloudConfigEqual(makeManagedCloudConfig(configv1.AzurePlatformType), makeManagedCloudConfig(configv1.AzurePlatformType))).Should(BeTrue(), "configmaps with identical content should be considered equal")
})

It("should return 'false' if ConfigMaps content are not equal", func() {
Expand All @@ -164,8 +164,7 @@ var _ = Describe("prepareSourceConfigMap reconciler method", func() {
managedCloudConfig := makeManagedCloudConfig(configv1.AzurePlatformType)

It("not prepared config should be different with managed one", func() {
_, ok := infraCloudConfig.Data[infraCloudConfKey]
Expect(ok).Should(BeTrue())
Expect(infraCloudConfig.Data).To(HaveKey(infraCloudConfKey))
Expect(reconciler.isCloudConfigEqual(infraCloudConfig, managedCloudConfig)).Should(BeFalse())
})

Expand All @@ -174,18 +173,16 @@ var _ = Describe("prepareSourceConfigMap reconciler method", func() {
Expect(err).Should(Succeed())
_, ok := preparedConfig.Data[infraCloudConfKey]
Expect(ok).Should(BeFalse())
_, ok = preparedConfig.Data[defaultConfigKey]
Expect(ok).Should(BeTrue())
Expect(reconciler.isCloudConfigEqual(preparedConfig, managedCloudConfig)).Should(BeTrue())
Expect(preparedConfig.Data).To(HaveKey(defaultConfigKey))
Expect(reconciler.isCloudConfigEqual(preparedConfig, managedCloudConfig)).Should(BeTrue(), "prepared config should have content equal to the managed cloud config")
})

It("config preparation should not touch extra fields in infra ConfigMap", func() {
extendedInfraConfig := infraCloudConfig.DeepCopy()
extendedInfraConfig.Data = map[string]string{infraCloudConfKey: "{}", "{}": "{}"}
preparedConfig, err := reconciler.prepareSourceConfigMap(extendedInfraConfig, infra)
Expect(err).Should(Succeed())
_, ok := preparedConfig.Data[defaultConfigKey]
Expect(ok).Should(BeTrue())
Expect(preparedConfig.Data).To(HaveKey(defaultConfigKey))
Expect(len(preparedConfig.Data)).Should(BeEquivalentTo(2))
})
})
Expand Down Expand Up @@ -370,8 +367,26 @@ var _ = Describe("Cloud config sync controller", func() {
}, timeout).Should(Succeed())
initialCMresourceVersion := syncedCloudConfigMap.ResourceVersion

// Introducing the consecutiveFailureWindow means that there's a field that could be racy
// between the manager calling Reconcile and the test calling Reconcile.
// In production, we only have 1 instance of the reconciler running.
// Create a fresh reconciler that is NOT registered with the manager.
// It shares the same API client (thread-safe) but has its own
// consecutiveFailureSince field, so no data race with the manager's copy.
freshReconciler := &CloudConfigReconciler{
ClusterOperatorStatusClient: ClusterOperatorStatusClient{
Client: cl,
Clock: clocktesting.NewFakePassiveClock(time.Now()),
ManagedNamespace: targetNamespaceName,
},
Scheme: scheme.Scheme,
FeatureGateAccess: featuregates.NewHardcodedFeatureGateAccessForTesting(
nil, []configv1.FeatureGateName{"AWSServiceLBNetworkSecurityGroup"}, nil, nil,
),
}

request := reconcile.Request{NamespacedName: client.ObjectKey{Name: "foo", Namespace: "bar"}}
_, err := reconciler.Reconcile(ctx, request)
_, err := freshReconciler.Reconcile(ctx, request)
Expect(err).Should(Succeed())

Expect(cl.Get(ctx, syncedConfigMapKey, syncedCloudConfigMap)).Should(Succeed())
Expand Down Expand Up @@ -531,7 +546,7 @@ var _ = Describe("Cloud config sync reconciler", func() {
Expect(len(allCMs.Items)).To(BeEquivalentTo(1))
})

It("should error if a user-specified configmap key isn't present", func() {
It("should degrade immediately if a user-specified configmap key isn't present", func() {
infraResource := makeInfrastructureResource(configv1.AWSPlatformType)
infraResource.Spec.CloudConfig.Key = "notfound"
Expect(cl.Create(ctx, infraResource)).To(Succeed())
Expand All @@ -540,8 +555,19 @@ var _ = Describe("Cloud config sync reconciler", func() {
Expect(cl.Status().Update(ctx, infraResource.DeepCopy())).To(Succeed())

_, err := reconciler.Reconcile(context.TODO(), ctrl.Request{})
Expect(err.Error()).To(ContainSubstring("specified in infra resource does not exist in source configmap"))

Expect(err).To(Succeed())

co := &configv1.ClusterOperator{}
Expect(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co)).To(Succeed())
var degradedCond *configv1.ClusterOperatorStatusCondition
for i := range co.Status.Conditions {
if co.Status.Conditions[i].Type == cloudConfigControllerDegradedCondition {
degradedCond = &co.Status.Conditions[i]
break
}
}
Expect(degradedCond).NotTo(BeNil())
Expect(degradedCond.Status).To(Equal(configv1.ConditionTrue))
})

It("should continue with reconcile when feature gates are available", func() {
Expand Down Expand Up @@ -606,16 +632,40 @@ var _ = Describe("Cloud config sync reconciler", func() {
})
})

It("reconcile should fail if no infra resource found", func() {
It("reconcile should succeed and be available if no infra resource found", func() {
_, err := reconciler.Reconcile(context.TODO(), ctrl.Request{})
Expect(err.Error()).Should(BeEquivalentTo("infrastructures.config.openshift.io \"cluster\" not found"))
Expect(err).To(Succeed())

co := &configv1.ClusterOperator{}
Expect(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co)).To(Succeed())
var availCond *configv1.ClusterOperatorStatusCondition
for i := range co.Status.Conditions {
if co.Status.Conditions[i].Type == cloudConfigControllerAvailableCondition {
availCond = &co.Status.Conditions[i]
break
}
}
Expect(availCond).NotTo(BeNil())
Expect(availCond.Status).To(Equal(configv1.ConditionTrue))
})

It("should fail if no PlatformStatus in infra resource presented ", func() {
It("should degrade immediately if no PlatformStatus in infra resource", func() {
infraResource := makeInfrastructureResource(configv1.AWSPlatformType)
Expect(cl.Create(ctx, infraResource)).To(Succeed())
_, err := reconciler.Reconcile(context.TODO(), ctrl.Request{})
Expect(err.Error()).Should(BeEquivalentTo("platformStatus is required"))
Expect(err).To(Succeed())

co := &configv1.ClusterOperator{}
Expect(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co)).To(Succeed())
var degradedCond *configv1.ClusterOperatorStatusCondition
for i := range co.Status.Conditions {
if co.Status.Conditions[i].Type == cloudConfigControllerDegradedCondition {
degradedCond = &co.Status.Conditions[i]
break
}
}
Expect(degradedCond).NotTo(BeNil())
Expect(degradedCond.Status).To(Equal(configv1.ConditionTrue))
})
})

Expand Down
Loading