From 0d7f22e24bece5dc8b4b32cb17c8a856bdf87c09 Mon Sep 17 00:00:00 2001 From: Simone Tiraboschi Date: Tue, 16 Jun 2026 11:26:45 +0200 Subject: [PATCH] fix(engine): reset thrashing state when user removes reconcile-paused annotation When a user removes the platform.kubevirt.io/reconcile-paused annotation to resume reconciliation, the in-memory ThrashingDetector state (consecutiveThrottles >= ThrashingThreshold) was never cleared. If the token bucket was still empty at that point (< 6 s after the pause was set), the very next throttled reconciliation immediately re-paused the resource, making it permanently stuck. Add Step 1.6 in ReconcileAsset: after the IsPaused guard returns false, check whether consecutiveThrottles has already reached the threshold. If so the operator previously set the annotation and the user just removed it, so reset the thrashing detector for that resource. Subsequent throttles now count from 0 and will not re-trigger a pause until a genuine new edit war accumulates enough consecutive throttles. Fixes: CNV-89796 Signed-off-by: Simone Tiraboschi --- pkg/engine/patcher.go | 21 +++++++++ pkg/engine/patcher_test.go | 97 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 118 insertions(+) diff --git a/pkg/engine/patcher.go b/pkg/engine/patcher.go index 7688e42..8cb6f32 100644 --- a/pkg/engine/patcher.go +++ b/pkg/engine/patcher.go @@ -184,6 +184,27 @@ func (p *Patcher) ReconcileAsset(ctx context.Context, assetMeta *assets.AssetMet return false, nil } + // Step 1.6: Clear stale thrashing state when the user removes the pause annotation. + // If in-memory state already reached the threshold (meaning the operator previously + // set the pause annotation) but the annotation is now absent, the user intentionally + // resumed reconciliation. Reset the counter so the next cycle starts from zero: + // the token bucket may still be empty and cause throttles, but those start from 0 + // and will not immediately re-trigger a pause. + if liveExists { + earlyKey := throttling.MakeResourceKey( + desired.GetNamespace(), + desired.GetName(), + desired.GetKind(), + ) + if p.thrashingDetector.GetAttempts(earlyKey) >= throttling.ThrashingThreshold { + logger.Info("Pause annotation removed by user, resetting thrashing state", + "name", assetMeta.Name, + "key", earlyKey, + ) + p.thrashingDetector.Reset(earlyKey) + } + } + // Step 2: Check opt-out annotation (mode: unmanaged) if liveExists && overrides.IsUnmanaged(live) { logger.V(1).Info("Asset is unmanaged, skipping", diff --git a/pkg/engine/patcher_test.go b/pkg/engine/patcher_test.go index 6950e41..c5ac476 100644 --- a/pkg/engine/patcher_test.go +++ b/pkg/engine/patcher_test.go @@ -28,6 +28,7 @@ import ( "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/runtime/schema" + "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" pkgassets "github.com/kubevirt/virt-platform-autopilot/pkg/assets" @@ -282,6 +283,102 @@ func TestThrashingEventEmittedOnlyOnce(t *testing.T) { } } +// TestThrashingStateResetOnPauseAnnotationRemoval reproduces the race where a user removes +// the reconcile-paused annotation before the token bucket has refilled (< 6 s after pause). +// Without the fix the in-memory consecutiveThrottles stays >= ThrashingThreshold, so the very +// next throttled reconciliation immediately re-pauses the resource. +// With the fix the thrashing state is cleared as soon as the pause annotation is absent, +// so subsequent throttles start from 0 and do not trigger a re-pause. +func TestThrashingStateResetOnPauseAnnotationRemoval(t *testing.T) { + loader := pkgassets.NewLoader() + renderer := NewRenderer(loader) + + assetMeta := &pkgassets.AssetMetadata{ + Name: "psi-enable", + Path: "active/machine-config/04-psi-enable.yaml", + Component: "MachineConfig", + } + + hco := pkgcontext.NewMockHCO("kubevirt-hyperconverged", "kubevirt-hyperconverged") + renderCtx := pkgcontext.NewRenderContext(hco) + + desired, err := renderer.RenderAsset(assetMeta, renderCtx) + if err != nil { + t.Fatalf("failed to render asset: %v", err) + } + + // Bucket: capacity 1, very long window so no natural refill during the test. + // Call 1 → token consumed; calls 2+ → throttled. + // After ThrashingThreshold throttles the operator sets the pause annotation. + smallBucket := throttling.NewTokenBucketWithSettings(1, time.Hour) + td := throttling.NewThrashingDetector() + + live := desired.DeepCopy() + fakeClient := fake.NewClientBuilder().WithObjects(live).Build() + + p := &Patcher{ + renderer: renderer, + applier: NewApplier(fakeClient, nil), + driftDetector: &alwaysDriftChecker{}, + throttle: smallBucket, + thrashingDetector: td, + client: fakeClient, + } + + // Drive the patcher until it sets the pause annotation. + // Call 1 consumes the token; calls 2, 3, 4 are throttled and hit the threshold. + for i := 0; i < throttling.ThrashingThreshold+1; i++ { + //nolint:errcheck // we only care about side-effects here + p.ReconcileAsset(context.Background(), assetMeta, renderCtx) + } + + // Verify the pause annotation was set on the live object. + paused := &unstructured.Unstructured{} + paused.SetGroupVersionKind(live.GroupVersionKind()) + if err := fakeClient.Get(context.Background(), client.ObjectKey{ + Namespace: live.GetNamespace(), + Name: live.GetName(), + }, paused); err != nil { + t.Fatalf("failed to fetch live object: %v", err) + } + annotations := paused.GetAnnotations() + if annotations == nil || annotations["platform.kubevirt.io/reconcile-paused"] != "true" { + t.Fatal("expected pause annotation to be set after edit-war threshold; was not set") + } + + // Simulate user removing the pause annotation (bucket still empty — the race window). + delete(annotations, "platform.kubevirt.io/reconcile-paused") + paused.SetAnnotations(annotations) + if err := fakeClient.Update(context.Background(), paused); err != nil { + t.Fatalf("failed to remove pause annotation: %v", err) + } + + // One more reconciliation while the bucket is still empty. + // Before the fix: consecutiveThrottles was still >= threshold → immediate re-pause. + // After the fix: thrashing state is cleared → throttle count resets to 1 → no re-pause. + _, reconcileErr := p.ReconcileAsset(context.Background(), assetMeta, renderCtx) + + // The call must return a throttle error (not a pause error) and must NOT re-set the annotation. + if reconcileErr == nil { + t.Fatal("expected a throttle error (bucket still empty) but got nil") + } + if strings.Contains(reconcileErr.Error(), "edit war") { + t.Errorf("got pause-re-trigger error %q; want a plain throttle error — thrashing state was not reset", reconcileErr) + } + + afterRemoval := &unstructured.Unstructured{} + afterRemoval.SetGroupVersionKind(live.GroupVersionKind()) + if err := fakeClient.Get(context.Background(), client.ObjectKey{ + Namespace: live.GetNamespace(), + Name: live.GetName(), + }, afterRemoval); err != nil { + t.Fatalf("failed to fetch live object after annotation removal: %v", err) + } + if ann := afterRemoval.GetAnnotations(); ann != nil && ann["platform.kubevirt.io/reconcile-paused"] == "true" { + t.Error("resource was immediately re-paused after user removed the annotation; thrashing state was not cleared") + } +} + func TestCountJSONPatchOperations(t *testing.T) { tests := []struct { name string