From 6fdfc4645ca1eafe3781fd54dda07c03dc86a04f Mon Sep 17 00:00:00 2001 From: Dejan Zele Pejchev Date: Tue, 23 Jun 2026 22:36:36 +0200 Subject: [PATCH 1/3] Add internal failure category constants to errormatch Signed-off-by: Dejan Zele Pejchev --- internal/common/errormatch/types.go | 18 ++++++++++++++ internal/common/errormatch/types_test.go | 30 ++++++++++++++++++++++++ 2 files changed, 48 insertions(+) create mode 100644 internal/common/errormatch/types_test.go diff --git a/internal/common/errormatch/types.go b/internal/common/errormatch/types.go index ab903ea05a6..540973fa01b 100644 --- a/internal/common/errormatch/types.go +++ b/internal/common/errormatch/types.go @@ -44,3 +44,21 @@ var KnownConditions = map[string]bool{ ConditionEvicted: true, ConditionDeadlineExceeded: true, } + +// CategoryInternal is the failure category for Armada-generated failures. It is +// hard-coded at the Error-construction site, not assigned by the operator +// categorizer. +const CategoryInternal = "internal" + +// Subcategories of CategoryInternal. +const ( + SubcategoryJobCreationFailed = "job-creation-failed" // executor failed to build the runnable job from the lease + SubcategoryPodMissing = "pod-missing" // reconciliation: pod vanished from Kubernetes for an active run + SubcategoryLeaseExpired = "lease-expired" // run cancelled because its executor went stale or was lost + SubcategoryMaxRunsExceeded = "max-runs-exceeded" // job exhausted its run attempts + SubcategoryJobRejected = "job-rejected" // submitcheck or validation rejected the job + SubcategoryStuckTerminating = "stuck-terminating" // node issue: the pod will not terminate + SubcategoryExternallyDeleted = "externally-deleted" // pod deleted out-of-band (not by Armada) + SubcategoryIssueHandlerError = "issue-handler-error" // pod unexpectedly changed state mid issue-handling + SubcategoryActiveDeadline = "active-deadline" // pod exceeded its user-set activeDeadlineSeconds +) diff --git a/internal/common/errormatch/types_test.go b/internal/common/errormatch/types_test.go new file mode 100644 index 00000000000..d4f9e3e5719 --- /dev/null +++ b/internal/common/errormatch/types_test.go @@ -0,0 +1,30 @@ +package errormatch + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +// maxFailureSubcategoryLen is the lookout job_run.failure_subcategory varchar(63) +// bound (lookout migration 032). +const maxFailureSubcategoryLen = 63 + +func TestInternalCategoryConstants_WithinLengthBound(t *testing.T) { + values := map[string]string{ + "CategoryInternal": CategoryInternal, + "SubcategoryJobCreationFailed": SubcategoryJobCreationFailed, + "SubcategoryPodMissing": SubcategoryPodMissing, + "SubcategoryLeaseExpired": SubcategoryLeaseExpired, + "SubcategoryMaxRunsExceeded": SubcategoryMaxRunsExceeded, + "SubcategoryJobRejected": SubcategoryJobRejected, + "SubcategoryStuckTerminating": SubcategoryStuckTerminating, + "SubcategoryExternallyDeleted": SubcategoryExternallyDeleted, + "SubcategoryIssueHandlerError": SubcategoryIssueHandlerError, + "SubcategoryActiveDeadline": SubcategoryActiveDeadline, + } + for name, value := range values { + assert.NotEmpty(t, value, "%s must not be empty", name) + assert.LessOrEqual(t, len(value), maxFailureSubcategoryLen, "%s = %q exceeds %d chars", name, value, maxFailureSubcategoryLen) + } +} From 7dc065335c7f49d50d50763c0f7bb78274696115 Mon Sep 17 00:00:00 2001 From: Dejan Zele Pejchev Date: Tue, 23 Jun 2026 22:36:36 +0200 Subject: [PATCH 2/3] Categorize Armada-authored executor failures as internal Signed-off-by: Dejan Zele Pejchev --- internal/executor/reporter/event.go | 13 +++--- internal/executor/reporter/event_test.go | 13 ++++++ internal/executor/service/job_requester.go | 3 ++ .../executor/service/pod_issue_handler.go | 42 +++++++++++++++---- .../service/pod_issue_handler_test.go | 35 ++++++++++------ 5 files changed, 81 insertions(+), 25 deletions(-) diff --git a/internal/executor/reporter/event.go b/internal/executor/reporter/event.go index 1119d781310..017697c62c5 100644 --- a/internal/executor/reporter/event.go +++ b/internal/executor/reporter/event.go @@ -209,10 +209,9 @@ func CreateSimpleJobPreemptedEvent(pod *v1.Pod) (*armadaevents.EventSequence, er return sequence, nil } -// CreateSimpleJobFailedEvent creates a failed event with no container details and no classification. -// Use for failures where pod container statuses are unavailable (preemption, submit failures). -// failure_category/failure_subcategory are left empty, resulting in NULL in the DB. -// This is intentional: these failures are not classifiable pod errors. +// CreateSimpleJobFailedEvent creates a failed event with no container details or +// failure category, for failures where pod container statuses are unavailable +// (preemption, submit failures). func CreateSimpleJobFailedEvent(pod *v1.Pod, reason string, clusterId string, cause armadaevents.KubernetesReason) (*armadaevents.EventSequence, error) { return CreateJobFailedEvent(pod, reason, cause, "", []*armadaevents.ContainerError{}, clusterId, "", "") } @@ -261,7 +260,7 @@ func CreateJobFailedEvent(pod *v1.Pod, reason string, cause armadaevents.Kuberne return sequence, nil } -func CreateMinimalJobFailedEvent(jobId string, runId string, jobSet string, queue string, clusterId string, message string) (*armadaevents.EventSequence, error) { +func CreateMinimalJobFailedEvent(jobId string, runId string, jobSet string, queue string, clusterId string, message string, failureCategory string, failureSubcategory string) (*armadaevents.EventSequence, error) { sequence := &armadaevents.EventSequence{} sequence.Queue = queue sequence.JobSetName = jobSet @@ -274,7 +273,9 @@ func CreateMinimalJobFailedEvent(jobId string, runId string, jobSet string, queu JobId: jobId, Errors: []*armadaevents.Error{ { - Terminal: true, + Terminal: true, + FailureCategory: failureCategory, + FailureSubcategory: failureSubcategory, Reason: &armadaevents.Error_PodError{ PodError: &armadaevents.PodError{ ObjectMeta: &armadaevents.ObjectMeta{ diff --git a/internal/executor/reporter/event_test.go b/internal/executor/reporter/event_test.go index 3e8978eedea..808cd47303c 100644 --- a/internal/executor/reporter/event_test.go +++ b/internal/executor/reporter/event_test.go @@ -314,3 +314,16 @@ func createService(serviceType v1.ServiceType, port int32, nodePort int32) *v1.S }, } } + +func TestCreateMinimalJobFailedEvent_SetsInternalCategory(t *testing.T) { + result, err := CreateMinimalJobFailedEvent("job1", "run1", "jobset1", "queue1", "cluster1", "pod missing", errormatch.CategoryInternal, errormatch.SubcategoryPodMissing) + require.NoError(t, err) + + require.Len(t, result.Events, 1) + event, ok := result.Events[0].Event.(*armadaevents.EventSequence_Event_JobRunErrors) + require.True(t, ok) + require.Len(t, event.JobRunErrors.Errors, 1) + assert.NotNil(t, event.JobRunErrors.Errors[0].GetPodError()) + assert.Equal(t, errormatch.CategoryInternal, event.JobRunErrors.Errors[0].GetFailureCategory()) + assert.Equal(t, errormatch.SubcategoryPodMissing, event.JobRunErrors.Errors[0].GetFailureSubcategory()) +} diff --git a/internal/executor/service/job_requester.go b/internal/executor/service/job_requester.go index f10b9f20ed3..9f9d4ba7b1f 100644 --- a/internal/executor/service/job_requester.go +++ b/internal/executor/service/job_requester.go @@ -7,6 +7,7 @@ import ( "golang.org/x/exp/maps" "github.com/armadaproject/armada/internal/common/armadacontext" + "github.com/armadaproject/armada/internal/common/errormatch" log "github.com/armadaproject/armada/internal/common/logging" "github.com/armadaproject/armada/internal/common/slices" "github.com/armadaproject/armada/internal/executor/configuration" @@ -187,6 +188,8 @@ func (r *JobRequester) sendFailedEvent(details *failedJobCreationDetails) error details.JobRunMeta.Queue, r.clusterId.GetClusterId(), details.Error.Error(), + errormatch.CategoryInternal, + errormatch.SubcategoryJobCreationFailed, ) if err != nil { return fmt.Errorf("failed to create job failed events because %s", err) diff --git a/internal/executor/service/pod_issue_handler.go b/internal/executor/service/pod_issue_handler.go index 768e5b709bc..68d8c6a3e06 100644 --- a/internal/executor/service/pod_issue_handler.go +++ b/internal/executor/service/pod_issue_handler.go @@ -12,6 +12,7 @@ import ( "k8s.io/utils/clock" "github.com/armadaproject/armada/internal/common/armadacontext" + "github.com/armadaproject/armada/internal/common/errormatch" log "github.com/armadaproject/armada/internal/common/logging" "github.com/armadaproject/armada/internal/executor/categorizer" "github.com/armadaproject/armada/internal/executor/configuration" @@ -424,11 +425,17 @@ func (p *PodIssueHandler) handleNonRetryableJobIssue(issue *issue) { if !issue.RunIssue.Reported { log.Infof("Handling non-retryable issue detected for job %s run %s", issue.RunIssue.JobId, issue.RunIssue.RunId) podIssue := issue.RunIssue.PodIssue - - result := p.classifier.ClassifyPodError(podIssue.OriginalPodState, podIssue.Message) clusterId := p.clusterContext.GetClusterId() - message := result.AppendHint(podIssue.Message) + var failureCategory, failureSubcategory, message string + if sub := internalSubcategoryForPodIssueType(podIssue.Type); sub != "" { + failureCategory, failureSubcategory = errormatch.CategoryInternal, sub + message = podIssue.Message + } else { + result := p.classifier.ClassifyPodError(podIssue.OriginalPodState, podIssue.Message) + failureCategory, failureSubcategory = result.Category, result.Subcategory + message = result.AppendHint(podIssue.Message) + } failedEvent, err := reporter.CreateJobFailedEvent( podIssue.OriginalPodState, @@ -437,8 +444,8 @@ func (p *PodIssueHandler) handleNonRetryableJobIssue(issue *issue) { podIssue.DebugMessage, util.ExtractFailedPodContainerStatuses(podIssue.OriginalPodState, clusterId), clusterId, - result.Category, - result.Subcategory) + failureCategory, + failureSubcategory) if err != nil { log.Errorf("Failed to create failed event for job %s because %s", issue.RunIssue.JobId, err) return @@ -450,7 +457,7 @@ func (p *PodIssueHandler) handleNonRetryableJobIssue(issue *issue) { } // Increment only after successful Report so failed sends do not inflate the counter. // RecordJobFailure is a no-op when classification didn't run (empty category). - metrics.RecordJobFailure(result.Category, result.Subcategory) + metrics.RecordJobFailure(failureCategory, failureSubcategory) p.markIssueReported(issue.RunIssue) } @@ -462,6 +469,25 @@ func (p *PodIssueHandler) handleNonRetryableJobIssue(issue *issue) { } } +// internalSubcategoryForPodIssueType returns the internal failure subcategory for +// Armada-detected structural pod issues. It returns "" for StuckStartingUp, +// UnableToSchedule, and FailedStartingUp, whose cause (e.g. image pull, +// scheduling) the operator categorizer should attribute instead. +func internalSubcategoryForPodIssueType(t podIssueType) string { + switch t { + case StuckTerminating: + return errormatch.SubcategoryStuckTerminating + case ExternallyDeleted: + return errormatch.SubcategoryExternallyDeleted + case ErrorDuringIssueHandling: + return errormatch.SubcategoryIssueHandlerError + case ActiveDeadlineExceeded: + return errormatch.SubcategoryActiveDeadline + default: + return "" + } +} + // For retryable issues we must: // - Report JobReturnLeaseEvent // @@ -611,7 +637,9 @@ func (p *PodIssueHandler) handleReconciliationIssue(issue *issue) { currentRunState.Meta.JobSet, currentRunState.Meta.Queue, p.clusterContext.GetClusterId(), - fmt.Sprintf("Pod is unexpectedly missing in Kubernetes"), + "Pod is unexpectedly missing in Kubernetes", + errormatch.CategoryInternal, + errormatch.SubcategoryPodMissing, ) if err != nil { log.Errorf("failed to create job failed event because %s", err) diff --git a/internal/executor/service/pod_issue_handler_test.go b/internal/executor/service/pod_issue_handler_test.go index 8b1a7f3794f..99eced96d63 100644 --- a/internal/executor/service/pod_issue_handler_test.go +++ b/internal/executor/service/pod_issue_handler_test.go @@ -95,9 +95,10 @@ func TestPodIssueService_DeletesPodAndReportsFailed_IfStuckAndUnretryable(t *tes assert.Len(t, failedEvent.JobRunErrors.Errors, 1) assert.Contains(t, failedEvent.JobRunErrors.Errors[0].GetPodError().Message, "unrecoverable problem") assert.Contains(t, failedEvent.JobRunErrors.Errors[0].GetPodError().DebugMessage, "Image pull has failed") + assert.NotEqual(t, errormatch.CategoryInternal, failedEvent.JobRunErrors.Errors[0].GetFailureCategory()) } -func TestPodIssueService_FailureCategorySet_WhenClassifierConfigured(t *testing.T) { +func TestPodIssueService_StructuralIssueIsInternal_RegardlessOfClassifier(t *testing.T) { classifier, err := categorizer.NewClassifier(categorizer.ErrorCategoriesConfig{ Categories: []categorizer.CategoryConfig{ { @@ -130,7 +131,6 @@ func TestPodIssueService_FailureCategorySet_WhenClassifierConfigured(t *testing. ) require.NoError(t, err) - // Stuck terminating pod with an OOMKilled container - the classifier should match pod := makeTerminatingPod() pod.Status.ContainerStatuses = []v1.ContainerStatus{ { @@ -148,8 +148,8 @@ func TestPodIssueService_FailureCategorySet_WhenClassifierConfigured(t *testing. failedEvent, ok := eventReporter.ReceivedEvents[0].Event.Events[0].Event.(*armadaevents.EventSequence_Event_JobRunErrors) require.True(t, ok) - assert.Equal(t, "oom-failure", failedEvent.JobRunErrors.Errors[0].GetFailureCategory()) - assert.Equal(t, "kernel-oom", failedEvent.JobRunErrors.Errors[0].GetFailureSubcategory()) + assert.Equal(t, errormatch.CategoryInternal, failedEvent.JobRunErrors.Errors[0].GetFailureCategory()) + assert.Equal(t, errormatch.SubcategoryStuckTerminating, failedEvent.JobRunErrors.Errors[0].GetFailureSubcategory()) // Verify ContainerErrors are populated (not empty) so retry engine has fallback data assert.NotEmpty(t, failedEvent.JobRunErrors.Errors[0].GetPodError().ContainerErrors) } @@ -175,14 +175,6 @@ func TestPodIssueService_OnPodErrorClassifies(t *testing.T) { }, expectMessageContains: "no match for platform in manifest", }, - "active deadline exceeded from executor-side detection": { - category: "user_error", - subcategory: "deadline_exceeded", - pattern: "exceeded active deadline", - // 10 minutes old with 5 minute deadline -> exceeded. - pod: func() *v1.Pod { return makePodWithDeadline(time.Now().Add(-time.Minute*10), 300, 0) }, - expectMessageContains: "exceeded active deadline", - }, } for name, tc := range tests { @@ -243,6 +235,8 @@ func TestPodIssueService_DeletesPodAndReportsFailed_IfStuckTerminating(t *testin assert.True(t, ok) assert.Len(t, failedEvent.JobRunErrors.Errors, 1) assert.Contains(t, failedEvent.JobRunErrors.Errors[0].GetPodError().Message, "terminating") + assert.Equal(t, errormatch.CategoryInternal, failedEvent.JobRunErrors.Errors[0].GetFailureCategory()) + assert.Equal(t, errormatch.SubcategoryStuckTerminating, failedEvent.JobRunErrors.Errors[0].GetFailureSubcategory()) } func TestPodIssueService_HasIssue(t *testing.T) { @@ -401,6 +395,8 @@ func TestPodIssueService_DeletesPodAndReportsFailed_IfExceedsActiveDeadline(t *t assert.True(t, ok) assert.Len(t, failedEvent.JobRunErrors.Errors, 1) assert.Contains(t, failedEvent.JobRunErrors.Errors[0].GetPodError().Message, "exceeded active deadline") + assert.Equal(t, errormatch.CategoryInternal, failedEvent.JobRunErrors.Errors[0].GetFailureCategory()) + assert.Equal(t, errormatch.SubcategoryActiveDeadline, failedEvent.JobRunErrors.Errors[0].GetFailureSubcategory()) } else { assert.Equal(t, []*v1.Pod{tc.pod}, remainingActivePods) assert.Len(t, eventsReporter.ReceivedEvents, 0) @@ -870,3 +866,18 @@ func TestCreateDebugMessage(t *testing.T) { }) } } + +func TestInternalSubcategoryForPodIssueType(t *testing.T) { + tests := map[podIssueType]string{ + StuckTerminating: errormatch.SubcategoryStuckTerminating, + ExternallyDeleted: errormatch.SubcategoryExternallyDeleted, + ErrorDuringIssueHandling: errormatch.SubcategoryIssueHandlerError, + ActiveDeadlineExceeded: errormatch.SubcategoryActiveDeadline, + StuckStartingUp: "", + UnableToSchedule: "", + FailedStartingUp: "", + } + for issueType, want := range tests { + assert.Equal(t, want, internalSubcategoryForPodIssueType(issueType)) + } +} From 4c47705fe68998410c240d5bccf52c0299aa145b Mon Sep 17 00:00:00 2001 From: Dejan Zele Pejchev Date: Tue, 23 Jun 2026 22:36:36 +0200 Subject: [PATCH 3/3] Categorize Armada-authored scheduler failures as internal Signed-off-by: Dejan Zele Pejchev --- internal/scheduler/scheduler.go | 13 +++++++-- internal/scheduler/scheduler_test.go | 41 ++++++++++++++++++++++++++++ 2 files changed, 51 insertions(+), 3 deletions(-) diff --git a/internal/scheduler/scheduler.go b/internal/scheduler/scheduler.go index 70dd98c804c..f10ce5eddb8 100644 --- a/internal/scheduler/scheduler.go +++ b/internal/scheduler/scheduler.go @@ -13,6 +13,7 @@ import ( "github.com/armadaproject/armada/internal/common/armadacontext" "github.com/armadaproject/armada/internal/common/constants" + "github.com/armadaproject/armada/internal/common/errormatch" protoutil "github.com/armadaproject/armada/internal/common/proto" armadaslices "github.com/armadaproject/armada/internal/common/slices" "github.com/armadaproject/armada/internal/leaderelection" @@ -1038,7 +1039,9 @@ func (s *Scheduler) generateUpdateMessagesFromJob(ctx *armadacontext.Context, jo } runError = &armadaevents.Error{ - Terminal: true, + Terminal: true, + FailureCategory: errormatch.CategoryInternal, + FailureSubcategory: errormatch.SubcategoryMaxRunsExceeded, Reason: &armadaevents.Error_MaxRunsExceeded{ MaxRunsExceeded: &armadaevents.MaxRunsExceeded{ Message: errorMessage, @@ -1131,7 +1134,9 @@ func (s *Scheduler) expireJobsIfNecessary(ctx *armadacontext.Context, txn *jobdb jobsToUpdate = append(jobsToUpdate, job.WithQueued(false).WithFailed(true).WithUpdatedRun(run.WithFailed(true))) leaseExpiredError := &armadaevents.Error{ - Terminal: true, + Terminal: true, + FailureCategory: errormatch.CategoryInternal, + FailureSubcategory: errormatch.SubcategoryLeaseExpired, Reason: &armadaevents.Error_LeaseExpired{ LeaseExpired: &armadaevents.LeaseExpired{}, }, @@ -1241,7 +1246,9 @@ func (s *Scheduler) submitCheck(ctx *armadacontext.Context, txn *jobdb.Txn) ([]* JobId: job.Id(), Errors: []*armadaevents.Error{ { - Terminal: true, + Terminal: true, + FailureCategory: errormatch.CategoryInternal, + FailureSubcategory: errormatch.SubcategoryJobRejected, Reason: &armadaevents.Error_JobRejected{ JobRejected: &armadaevents.JobRejected{ Message: result.reason, diff --git a/internal/scheduler/scheduler_test.go b/internal/scheduler/scheduler_test.go index 5a3368a70a2..849509a33c5 100644 --- a/internal/scheduler/scheduler_test.go +++ b/internal/scheduler/scheduler_test.go @@ -23,6 +23,7 @@ import ( "github.com/armadaproject/armada/internal/common/armadacontext" "github.com/armadaproject/armada/internal/common/armadaerrors" apiconfig "github.com/armadaproject/armada/internal/common/constants" + "github.com/armadaproject/armada/internal/common/errormatch" "github.com/armadaproject/armada/internal/common/ingest/utils" protoutil "github.com/armadaproject/armada/internal/common/proto" "github.com/armadaproject/armada/internal/common/pulsarutils" @@ -1085,6 +1086,8 @@ func TestScheduler_TestCycle(t *testing.T) { assert.Empty(t, m, "%d outstanding eventSequences of type %s", len(m), eventType) } + assertInternalFailureCategories(t, publisher.eventSequences) + // assert that the serials are where we expect them to be. // On publish failure the cursor must not advance, so that the next cycle re-fetches // and re-publishes the same updates. This is what guarantees at-least-once delivery @@ -4048,3 +4051,41 @@ func TestAppendEventSequencesFromPreemptedJobs_NilPreemptingJob(t *testing.T) { assert.Equal(t, preemptedRun.Id(), preemptedEvent.PreemptedRunId) assert.Equal(t, "", preemptedEvent.PreemptingJobId) } + +// assertInternalFailureCategories checks that each published lease-expired, +// max-runs-exceeded, and job-rejected error carries the internal category and +// its matching subcategory. +func assertInternalFailureCategories(t *testing.T, sequences []*armadaevents.EventSequence) { + t.Helper() + for _, es := range sequences { + for _, event := range es.Events { + var runError *armadaevents.Error + switch e := event.Event.(type) { + case *armadaevents.EventSequence_Event_JobRunErrors: + if len(e.JobRunErrors.Errors) > 0 { + runError = e.JobRunErrors.Errors[0] + } + case *armadaevents.EventSequence_Event_JobErrors: + if len(e.JobErrors.Errors) > 0 { + runError = e.JobErrors.Errors[0] + } + } + if runError == nil { + continue + } + var wantSubcategory string + switch runError.Reason.(type) { + case *armadaevents.Error_LeaseExpired: + wantSubcategory = errormatch.SubcategoryLeaseExpired + case *armadaevents.Error_MaxRunsExceeded: + wantSubcategory = errormatch.SubcategoryMaxRunsExceeded + case *armadaevents.Error_JobRejected: + wantSubcategory = errormatch.SubcategoryJobRejected + default: + continue + } + assert.Equal(t, errormatch.CategoryInternal, runError.GetFailureCategory()) + assert.Equal(t, wantSubcategory, runError.GetFailureSubcategory()) + } + } +}