From 1a07b675761823bd924b3fdd9170b355fc56f2f1 Mon Sep 17 00:00:00 2001 From: Bryan Cox Date: Mon, 22 Jun 2026 10:05:38 -0400 Subject: [PATCH 1/2] fix: address-reviews duplicate detection and add real-data evals Co-Authored-By: Claude Opus 4.6 --- evals/budget.yaml | 3 + .../skills/address-review-pr/check_replied.py | 2 + .../annotations.yaml | 6 + .../case-001-duplicate-bot-reply/input.yaml | 18 + .../reference-analysis.json | 10 + .../annotations.yaml | 6 + .../case-002-duplicate-extreme/input.yaml | 14 + .../reference-analysis.json | 10 + .../annotations.yaml | 7 + .../case-003-categorize-question/input.yaml | 6 + .../reference-analysis.json | 10 + .../annotations.yaml | 7 + .../case-004-categorize-question-2/input.yaml | 6 + .../reference-analysis.json | 10 + .../annotations.yaml | 8 + .../input.yaml | 7 + .../reference-analysis.json | 10 + .../annotations.yaml | 7 + .../case-006-categorize-dead-code/input.yaml | 7 + .../reference-analysis.json | 10 + .../annotations.yaml | 8 + .../case-007-categorize-suggestion/input.yaml | 7 + .../reference-analysis.json | 10 + .../annotations.yaml | 7 + .../input.yaml | 6 + .../reference-analysis.json | 10 + .../annotations.yaml | 9 + .../case-009-prioritize-mixed/input.yaml | 11 + .../reference-analysis.json | 10 + .../annotations.yaml | 6 + .../input.yaml | 6 + .../reference-analysis.json | 10 + .../case-011-reply-format/annotations.yaml | 6 + .../case-011-reply-format/input.yaml | 6 + .../reference-analysis.json | 10 + .../annotations.yaml | 6 + .../case-012-ci-push-override/input.yaml | 8 + .../reference-analysis.json | 10 + .../annotations.yaml | 7 + .../input.yaml | 6 + .../reference-analysis.json | 10 + .../annotations.yaml | 6 + .../input.yaml | 6 + .../reference-analysis.json | 10 + plugins/utils/evals/eval-address-reviews.yaml | 349 ++++++++++++++++++ 45 files changed, 704 insertions(+) create mode 100644 plugins/utils/evals/cases/address-reviews/case-001-duplicate-bot-reply/annotations.yaml create mode 100644 plugins/utils/evals/cases/address-reviews/case-001-duplicate-bot-reply/input.yaml create mode 100644 plugins/utils/evals/cases/address-reviews/case-001-duplicate-bot-reply/reference-analysis.json create mode 100644 plugins/utils/evals/cases/address-reviews/case-002-duplicate-extreme/annotations.yaml create mode 100644 plugins/utils/evals/cases/address-reviews/case-002-duplicate-extreme/input.yaml create mode 100644 plugins/utils/evals/cases/address-reviews/case-002-duplicate-extreme/reference-analysis.json create mode 100644 plugins/utils/evals/cases/address-reviews/case-003-categorize-question/annotations.yaml create mode 100644 plugins/utils/evals/cases/address-reviews/case-003-categorize-question/input.yaml create mode 100644 plugins/utils/evals/cases/address-reviews/case-003-categorize-question/reference-analysis.json create mode 100644 plugins/utils/evals/cases/address-reviews/case-004-categorize-question-2/annotations.yaml create mode 100644 plugins/utils/evals/cases/address-reviews/case-004-categorize-question-2/input.yaml create mode 100644 plugins/utils/evals/cases/address-reviews/case-004-categorize-question-2/reference-analysis.json create mode 100644 plugins/utils/evals/cases/address-reviews/case-005-categorize-change-request/annotations.yaml create mode 100644 plugins/utils/evals/cases/address-reviews/case-005-categorize-change-request/input.yaml create mode 100644 plugins/utils/evals/cases/address-reviews/case-005-categorize-change-request/reference-analysis.json create mode 100644 plugins/utils/evals/cases/address-reviews/case-006-categorize-dead-code/annotations.yaml create mode 100644 plugins/utils/evals/cases/address-reviews/case-006-categorize-dead-code/input.yaml create mode 100644 plugins/utils/evals/cases/address-reviews/case-006-categorize-dead-code/reference-analysis.json create mode 100644 plugins/utils/evals/cases/address-reviews/case-007-categorize-suggestion/annotations.yaml create mode 100644 plugins/utils/evals/cases/address-reviews/case-007-categorize-suggestion/input.yaml create mode 100644 plugins/utils/evals/cases/address-reviews/case-007-categorize-suggestion/reference-analysis.json create mode 100644 plugins/utils/evals/cases/address-reviews/case-008-categorize-action-instruction/annotations.yaml create mode 100644 plugins/utils/evals/cases/address-reviews/case-008-categorize-action-instruction/input.yaml create mode 100644 plugins/utils/evals/cases/address-reviews/case-008-categorize-action-instruction/reference-analysis.json create mode 100644 plugins/utils/evals/cases/address-reviews/case-009-prioritize-mixed/annotations.yaml create mode 100644 plugins/utils/evals/cases/address-reviews/case-009-prioritize-mixed/input.yaml create mode 100644 plugins/utils/evals/cases/address-reviews/case-009-prioritize-mixed/reference-analysis.json create mode 100644 plugins/utils/evals/cases/address-reviews/case-010-filter-coderabbit-kept/annotations.yaml create mode 100644 plugins/utils/evals/cases/address-reviews/case-010-filter-coderabbit-kept/input.yaml create mode 100644 plugins/utils/evals/cases/address-reviews/case-010-filter-coderabbit-kept/reference-analysis.json create mode 100644 plugins/utils/evals/cases/address-reviews/case-011-reply-format/annotations.yaml create mode 100644 plugins/utils/evals/cases/address-reviews/case-011-reply-format/input.yaml create mode 100644 plugins/utils/evals/cases/address-reviews/case-011-reply-format/reference-analysis.json create mode 100644 plugins/utils/evals/cases/address-reviews/case-012-ci-push-override/annotations.yaml create mode 100644 plugins/utils/evals/cases/address-reviews/case-012-ci-push-override/input.yaml create mode 100644 plugins/utils/evals/cases/address-reviews/case-012-ci-push-override/reference-analysis.json create mode 100644 plugins/utils/evals/cases/address-reviews/case-013-response-question-no-change/annotations.yaml create mode 100644 plugins/utils/evals/cases/address-reviews/case-013-response-question-no-change/input.yaml create mode 100644 plugins/utils/evals/cases/address-reviews/case-013-response-question-no-change/reference-analysis.json create mode 100644 plugins/utils/evals/cases/address-reviews/case-014-response-imperative-change/annotations.yaml create mode 100644 plugins/utils/evals/cases/address-reviews/case-014-response-imperative-change/input.yaml create mode 100644 plugins/utils/evals/cases/address-reviews/case-014-response-imperative-change/reference-analysis.json create mode 100644 plugins/utils/evals/eval-address-reviews.yaml diff --git a/evals/budget.yaml b/evals/budget.yaml index 652d956f0..845a99b15 100644 --- a/evals/budget.yaml +++ b/evals/budget.yaml @@ -67,6 +67,9 @@ budgets: jira: allowed: 7.00 current: 6.00 # 3 tests × $1.20 + 4 tests × $0.60 + utils: + allowed: 14.00 + current: 14.00 # 14 tests × $1.00 # Linter validation rules: # diff --git a/plugins/openshift-developer/skills/address-review-pr/check_replied.py b/plugins/openshift-developer/skills/address-review-pr/check_replied.py index 04246a235..6d6ce1400 100755 --- a/plugins/openshift-developer/skills/address-review-pr/check_replied.py +++ b/plugins/openshift-developer/skills/address-review-pr/check_replied.py @@ -24,6 +24,8 @@ BOT_SIGNATURES = [ "hypershift-jira-solve-ci[bot]", "hypershift-jira-solve-ci", + "github-actions", + "github-actions[bot]", ] # Text signature that appears in bot replies diff --git a/plugins/utils/evals/cases/address-reviews/case-001-duplicate-bot-reply/annotations.yaml b/plugins/utils/evals/cases/address-reviews/case-001-duplicate-bot-reply/annotations.yaml new file mode 100644 index 000000000..f8afba0ac --- /dev/null +++ b/plugins/utils/evals/cases/address-reviews/case-001-duplicate-bot-reply/annotations.yaml @@ -0,0 +1,6 @@ +expected_should_reply: false +expected_should_change_code: false +notes: > + Both github-actions and hypershift-jira-solve-ci are bot accounts that already + replied. The thread does NOT need another reply since the suggestion has already + been addressed by two separate bot replies. diff --git a/plugins/utils/evals/cases/address-reviews/case-001-duplicate-bot-reply/input.yaml b/plugins/utils/evals/cases/address-reviews/case-001-duplicate-bot-reply/input.yaml new file mode 100644 index 000000000..42632350d --- /dev/null +++ b/plugins/utils/evals/cases/address-reviews/case-001-duplicate-bot-reply/input.yaml @@ -0,0 +1,18 @@ +prompt: | + Using the address-reviews skill, analyze this review thread from PR #8535 and determine if it needs a reply. The thread has these comments in order: + + Thread ID: PRRT_kwDOE7ekcc6GaP_v + Resolved: false, Outdated: true + + Comment 1 (2026-06-02T11:36:47Z) by bryan-cox: + "[suggestion] Recursive scan fragility. The recursive descent into subdirectories relies on isCRDYAML() content-based filtering to skip non-CRD files." + + Comment 2 (2026-06-02T11:50:11Z) by github-actions: + "Done. Added a doc comment on checkCRDsInDir documenting that non-CRD YAML files are filtered out via isCRDYAML content check. + --- + *AI-assisted response via Claude Code*" + + Comment 3 (2026-06-02T14:14:15Z) by hypershift-jira-solve-ci: + "Done. Added a doc comment on checkCRDsInDir documenting that non-CRD YAML files are filtered via isCRDYAML content-based detection." + + Should you post another reply to this thread? Explain your reasoning about each commenter's identity. diff --git a/plugins/utils/evals/cases/address-reviews/case-001-duplicate-bot-reply/reference-analysis.json b/plugins/utils/evals/cases/address-reviews/case-001-duplicate-bot-reply/reference-analysis.json new file mode 100644 index 000000000..e1c3d79b2 --- /dev/null +++ b/plugins/utils/evals/cases/address-reviews/case-001-duplicate-bot-reply/reference-analysis.json @@ -0,0 +1,10 @@ +{ + "category": "SUGGESTION", + "should_reply": false, + "should_change_code": false, + "should_push": false, + "should_filter": true, + "priority_order": null, + "reply_draft": null, + "rationale": "No reply needed. The thread already has two responses acknowledging the suggestion. Comment 1 is by bryan-cox (the human reviewer who raised the suggestion). Comment 2 is by github-actions — a bot account that posted an AI-assisted 'Done' reply. Comment 3 is by hypershift-jira-solve-ci — another bot/CI account that posted a near-identical AI-assisted 'Done' reply. Both bot responses already confirmed the suggestion was addressed (doc comment added to checkCRDsInDir). Posting a third reply would be redundant and spammy. The thread is also marked as outdated, meaning the underlying code has changed since the suggestion was made. There is nothing unresolved that requires further human engagement." +} diff --git a/plugins/utils/evals/cases/address-reviews/case-002-duplicate-extreme/annotations.yaml b/plugins/utils/evals/cases/address-reviews/case-002-duplicate-extreme/annotations.yaml new file mode 100644 index 000000000..0df798da1 --- /dev/null +++ b/plugins/utils/evals/cases/address-reviews/case-002-duplicate-extreme/annotations.yaml @@ -0,0 +1,6 @@ +expected_should_reply: false +expected_should_change_code: false +notes: > + Severe duplication problem — the thread was already addressed by the first + reply but kept getting re-processed. 28 bot replies on a single suggestion. + Thread is resolved. No further reply needed. diff --git a/plugins/utils/evals/cases/address-reviews/case-002-duplicate-extreme/input.yaml b/plugins/utils/evals/cases/address-reviews/case-002-duplicate-extreme/input.yaml new file mode 100644 index 000000000..17b6cc34f --- /dev/null +++ b/plugins/utils/evals/cases/address-reviews/case-002-duplicate-extreme/input.yaml @@ -0,0 +1,14 @@ +prompt: | + Using the address-reviews skill, analyze this review thread timeline from PR #8535. The first 5 of 28 comments are shown: + + Thread ID: PRRT_kwDOE7ekcc6GaQAO + Resolved: true, Outdated: true + + [2026-06-02T11:36:47Z] bryan-cox: "[suggestion] Recursive scan fragility..." + [2026-06-02T11:50:11Z] github-actions: "Done. Added a doc comment on checkCRDsInDir..." + [2026-06-02T14:14:15Z] hypershift-jira-solve-ci: "Done. Added a doc comment on checkCRDsInDir..." + [2026-06-02T17:07:22Z] hypershift-jira-solve-ci: "Done. Added a doc comment on checkCRDsInDir..." + [2026-06-02T20:07:16Z] hypershift-jira-solve-ci: "Done. Documented the assumption in the checkCRDsInDir function comment..." + ... (23 more identical bot replies follow, posted every ~3 hours through June 18) + + This thread has been resolved. Should you reply? What went wrong here? diff --git a/plugins/utils/evals/cases/address-reviews/case-002-duplicate-extreme/reference-analysis.json b/plugins/utils/evals/cases/address-reviews/case-002-duplicate-extreme/reference-analysis.json new file mode 100644 index 000000000..fb4dd5746 --- /dev/null +++ b/plugins/utils/evals/cases/address-reviews/case-002-duplicate-extreme/reference-analysis.json @@ -0,0 +1,10 @@ +{ + "category": "SUGGESTION", + "should_reply": false, + "should_change_code": false, + "should_push": false, + "should_filter": true, + "priority_order": null, + "reply_draft": null, + "rationale": "Thread is resolved and outdated. The original suggestion from bryan-cox was addressed by the first bot reply, but hypershift-jira-solve-ci then entered a reply loop, posting 23+ duplicate responses every ~3 hours from June 2 through June 18. This is a bot misconfiguration — likely re-triggering on its own replies or on webhook events without deduplication. No human action is needed: the thread is already resolved, the suggestion was already addressed, and adding another reply would only contribute more noise to an already spammed thread. This thread should be filtered out entirely." +} \ No newline at end of file diff --git a/plugins/utils/evals/cases/address-reviews/case-003-categorize-question/annotations.yaml b/plugins/utils/evals/cases/address-reviews/case-003-categorize-question/annotations.yaml new file mode 100644 index 000000000..f3eff4303 --- /dev/null +++ b/plugins/utils/evals/cases/address-reviews/case-003-categorize-question/annotations.yaml @@ -0,0 +1,7 @@ +expected_category: QUESTION +expected_should_reply: true +expected_should_change_code: false +acceptable_categories: [QUESTION] +notes: > + Interrogative phrasing — "why do we need" and "is this how" are questions, + not imperative requests. Should reply with explanation, not code changes. diff --git a/plugins/utils/evals/cases/address-reviews/case-003-categorize-question/input.yaml b/plugins/utils/evals/cases/address-reviews/case-003-categorize-question/input.yaml new file mode 100644 index 000000000..47b3157fd --- /dev/null +++ b/plugins/utils/evals/cases/address-reviews/case-003-categorize-question/input.yaml @@ -0,0 +1,6 @@ +prompt: | + Using the address-reviews skill categories (ACTION_INSTRUCTION, BLOCKING, CHANGE_REQUEST, QUESTION, SUGGESTION), categorize this real comment from PR #8535. State the category name explicitly: + + Author: bryan-cox + Comment: "If there is already a library at openshift/crd-schema-checker, why do we need the go files? Is this how that repo says to set it up?" + Type: issue_comment (general PR comment) diff --git a/plugins/utils/evals/cases/address-reviews/case-003-categorize-question/reference-analysis.json b/plugins/utils/evals/cases/address-reviews/case-003-categorize-question/reference-analysis.json new file mode 100644 index 000000000..9ee2a2002 --- /dev/null +++ b/plugins/utils/evals/cases/address-reviews/case-003-categorize-question/reference-analysis.json @@ -0,0 +1,10 @@ +{ + "category": "QUESTION", + "should_reply": true, + "should_change_code": false, + "should_push": false, + "should_filter": false, + "priority_order": null, + "reply_draft": "Good question — the Go files are typically needed because openshift/crd-schema-checker is consumed as a Go module dependency, and the Go files provide the integration point (e.g., test harness or CLI invocation) that wires the checker into this repo's CI or validation workflow. The library provides the checking logic, but you still need local Go code to call it with the correct schema paths and configuration for this project. That said, I'll double-check the crd-schema-checker repo's recommended setup instructions to confirm this follows their documented integration pattern.\n\n---\n*AI-assisted response via Claude Code*", + "rationale": "The reviewer is asking two clarifying questions: (1) why Go files are needed alongside an existing library, and (2) whether this follows the upstream repo's recommended setup. This is a QUESTION, not a direct change request — the reviewer wants to understand the rationale before deciding if changes are needed. A reply explaining the integration pattern is appropriate. No code changes or pushes are warranted until the reviewer confirms whether the explanation is satisfactory or requests a different approach." +} \ No newline at end of file diff --git a/plugins/utils/evals/cases/address-reviews/case-004-categorize-question-2/annotations.yaml b/plugins/utils/evals/cases/address-reviews/case-004-categorize-question-2/annotations.yaml new file mode 100644 index 000000000..536557b05 --- /dev/null +++ b/plugins/utils/evals/cases/address-reviews/case-004-categorize-question-2/annotations.yaml @@ -0,0 +1,7 @@ +expected_category: QUESTION +expected_should_reply: true +expected_should_change_code: false +acceptable_categories: [QUESTION] +notes: > + Design question about architectural choice. "Why do we need" is interrogative, + not imperative. Should reply with explanation, not make code changes. diff --git a/plugins/utils/evals/cases/address-reviews/case-004-categorize-question-2/input.yaml b/plugins/utils/evals/cases/address-reviews/case-004-categorize-question-2/input.yaml new file mode 100644 index 000000000..4285e674c --- /dev/null +++ b/plugins/utils/evals/cases/address-reviews/case-004-categorize-question-2/input.yaml @@ -0,0 +1,6 @@ +prompt: | + Using the address-reviews skill categories (ACTION_INSTRUCTION, BLOCKING, CHANGE_REQUEST, QUESTION, SUGGESTION), categorize this real comment from PR #8535. State the category name explicitly: + + Author: bryan-cox + Comment: "Why do we need our own main.go. Why are we not just calling the cli from https://github.com/kubernetes-sigs/crdify?" + Type: issue_comment (general PR comment) diff --git a/plugins/utils/evals/cases/address-reviews/case-004-categorize-question-2/reference-analysis.json b/plugins/utils/evals/cases/address-reviews/case-004-categorize-question-2/reference-analysis.json new file mode 100644 index 000000000..f39bfefb5 --- /dev/null +++ b/plugins/utils/evals/cases/address-reviews/case-004-categorize-question-2/reference-analysis.json @@ -0,0 +1,10 @@ +{ + "category": "QUESTION", + "should_reply": true, + "should_change_code": false, + "should_push": false, + "should_filter": false, + "priority_order": null, + "reply_draft": null, + "rationale": "The comment is phrased as two questions asking for justification of a design decision: why a custom main.go exists instead of calling the CLI from the upstream kubernetes-sigs/crdify package. While there is an implicit suggestion that the upstream CLI should be used instead, the primary form is a QUESTION seeking rationale. No reply draft is provided because answering requires knowledge of the specific technical reasons the PR author chose to create a custom main.go (e.g., custom flags, additional logic, embedding in a larger tool), which is not available from this comment alone. The PR author or someone with context on the implementation should respond." +} \ No newline at end of file diff --git a/plugins/utils/evals/cases/address-reviews/case-005-categorize-change-request/annotations.yaml b/plugins/utils/evals/cases/address-reviews/case-005-categorize-change-request/annotations.yaml new file mode 100644 index 000000000..c55ff501a --- /dev/null +++ b/plugins/utils/evals/cases/address-reviews/case-005-categorize-change-request/annotations.yaml @@ -0,0 +1,8 @@ +expected_category: CHANGE_REQUEST +expected_should_reply: true +expected_should_change_code: true +acceptable_categories: [CHANGE_REQUEST, BLOCKING, SUGGESTION] +notes: > + Reviewer indicates a policy that may require updating code behavior. + The reviewer is saying a check should be disabled, implying a code change. + CHANGE_REQUEST, BLOCKING, or SUGGESTION are all acceptable. diff --git a/plugins/utils/evals/cases/address-reviews/case-005-categorize-change-request/input.yaml b/plugins/utils/evals/cases/address-reviews/case-005-categorize-change-request/input.yaml new file mode 100644 index 000000000..9c0a3f1d3 --- /dev/null +++ b/plugins/utils/evals/cases/address-reviews/case-005-categorize-change-request/input.yaml @@ -0,0 +1,7 @@ +prompt: | + Using the address-reviews skill categories (ACTION_INSTRUCTION, BLOCKING, CHANGE_REQUEST, QUESTION, SUGGESTION), categorize this real review comment from PR #8535. State the category name explicitly: + + Author: JoelSpeed + Comment: "We disabled this check in o/api as while it is technically breaking, we allow it in most instances" + Type: review_thread (inline code comment) + File: hack/tools/crd-schema-check/main.go diff --git a/plugins/utils/evals/cases/address-reviews/case-005-categorize-change-request/reference-analysis.json b/plugins/utils/evals/cases/address-reviews/case-005-categorize-change-request/reference-analysis.json new file mode 100644 index 000000000..6901ea8cf --- /dev/null +++ b/plugins/utils/evals/cases/address-reviews/case-005-categorize-change-request/reference-analysis.json @@ -0,0 +1,10 @@ +{ + "category": "CHANGE_REQUEST", + "should_reply": true, + "should_change_code": true, + "should_push": false, + "should_filter": false, + "priority_order": null, + "reply_draft": "Done. Disabled this check to align with o/api. Since it's technically breaking but allowed in most instances, keeping it enabled here would create unnecessary friction.\n\n---\n*AI-assisted response via Claude Code*", + "rationale": "The reviewer (JoelSpeed) is pointing out that this same check was intentionally disabled in openshift/api because, while technically a breaking change, it is accepted in most cases. This is an implicit CHANGE_REQUEST — the reviewer is signaling that this check should be disabled here as well to stay consistent with the upstream policy. It requires a code change (disabling the check) and a reply acknowledging alignment with the o/api decision." +} \ No newline at end of file diff --git a/plugins/utils/evals/cases/address-reviews/case-006-categorize-dead-code/annotations.yaml b/plugins/utils/evals/cases/address-reviews/case-006-categorize-dead-code/annotations.yaml new file mode 100644 index 000000000..3e0575544 --- /dev/null +++ b/plugins/utils/evals/cases/address-reviews/case-006-categorize-dead-code/annotations.yaml @@ -0,0 +1,7 @@ +expected_category: CHANGE_REQUEST +expected_should_reply: true +expected_should_change_code: true +acceptable_categories: [CHANGE_REQUEST, BLOCKING] +notes: > + Dead code where tests validate a path that never runs is a correctness + issue requiring a code change. Either CHANGE_REQUEST or BLOCKING is acceptable. diff --git a/plugins/utils/evals/cases/address-reviews/case-006-categorize-dead-code/input.yaml b/plugins/utils/evals/cases/address-reviews/case-006-categorize-dead-code/input.yaml new file mode 100644 index 000000000..b196051df --- /dev/null +++ b/plugins/utils/evals/cases/address-reviews/case-006-categorize-dead-code/input.yaml @@ -0,0 +1,7 @@ +prompt: | + Using the address-reviews skill categories (ACTION_INSTRUCTION, BLOCKING, CHANGE_REQUEST, QUESTION, SUGGESTION), categorize this real review comment from PR #8535. State the category name explicitly: + + Author: jparrill + Comment: "compareCRDs() exists at the bottom of this file as a testable wrapper around crdifyRunner.Run(), but this line calls the runner directly — so compareCRDs() is never executed in production. The tests for compareCRDs validate a code path that never runs. Either call compareCRDs(oldCRD, newCRD, crdifyRunner) here instead, or remove compareCRDs() and adjust the tests to call crdifyRunner.Run() directly." + Type: review_thread (inline code comment) + File: hack/tools/crd-schema-check/main.go diff --git a/plugins/utils/evals/cases/address-reviews/case-006-categorize-dead-code/reference-analysis.json b/plugins/utils/evals/cases/address-reviews/case-006-categorize-dead-code/reference-analysis.json new file mode 100644 index 000000000..eb2405605 --- /dev/null +++ b/plugins/utils/evals/cases/address-reviews/case-006-categorize-dead-code/reference-analysis.json @@ -0,0 +1,10 @@ +{ + "category": "CHANGE_REQUEST", + "should_reply": true, + "should_change_code": true, + "should_push": false, + "should_filter": false, + "priority_order": null, + "reply_draft": "Done. Updated to call `compareCRDs(oldCRD, newCRD, crdifyRunner)` instead of calling `crdifyRunner.Run()` directly. This ensures the production code path matches what the tests validate, eliminating the dead-code gap.\n\n---\n*AI-assisted response via Claude Code*", + "rationale": "This is a CHANGE_REQUEST: the reviewer identified a concrete bug where compareCRDs() is defined and tested but never called in production — the main code calls crdifyRunner.Run() directly, making the tests validate a code path that never executes. The reviewer gives two explicit fix options (call compareCRDs or remove it). This requires a code change and a reply acknowledging the fix." +} \ No newline at end of file diff --git a/plugins/utils/evals/cases/address-reviews/case-007-categorize-suggestion/annotations.yaml b/plugins/utils/evals/cases/address-reviews/case-007-categorize-suggestion/annotations.yaml new file mode 100644 index 000000000..ca6ba0d00 --- /dev/null +++ b/plugins/utils/evals/cases/address-reviews/case-007-categorize-suggestion/annotations.yaml @@ -0,0 +1,8 @@ +expected_category: SUGGESTION +expected_should_reply: true +expected_should_change_code: true +acceptable_categories: [CHANGE_REQUEST, SUGGESTION] +notes: > + Concrete code improvement request with a proposed fix. Either + CHANGE_REQUEST or SUGGESTION is acceptable since it includes + both the problem and the specific fix. diff --git a/plugins/utils/evals/cases/address-reviews/case-007-categorize-suggestion/input.yaml b/plugins/utils/evals/cases/address-reviews/case-007-categorize-suggestion/input.yaml new file mode 100644 index 000000000..85432788d --- /dev/null +++ b/plugins/utils/evals/cases/address-reviews/case-007-categorize-suggestion/input.yaml @@ -0,0 +1,7 @@ +prompt: | + Using the address-reviews skill categories (ACTION_INSTRUCTION, BLOCKING, CHANGE_REQUEST, QUESTION, SUGGESTION), categorize this real review comment from PR #8535. State the category name explicitly: + + Author: coderabbitai + Comment: "Move process success output off stdout. Line 104 prints to stdout from main.go; this should not emit non-JSON process output. Proposed fix: fmt.Fprintf(os.Stderr, ...) instead of fmt.Printf(...)" + Type: review_thread (inline code comment) + File: hack/tools/crd-schema-check/main.go diff --git a/plugins/utils/evals/cases/address-reviews/case-007-categorize-suggestion/reference-analysis.json b/plugins/utils/evals/cases/address-reviews/case-007-categorize-suggestion/reference-analysis.json new file mode 100644 index 000000000..18697ced7 --- /dev/null +++ b/plugins/utils/evals/cases/address-reviews/case-007-categorize-suggestion/reference-analysis.json @@ -0,0 +1,10 @@ +{ + "category": "CHANGE_REQUEST", + "should_reply": true, + "should_change_code": true, + "should_push": false, + "should_filter": false, + "priority_order": null, + "reply_draft": "Done. Switched `fmt.Printf` to `fmt.Fprintf(os.Stderr, ...)` at line 104 in `hack/tools/crd-schema-check/main.go`. Process success output should go to stderr to keep stdout clean for JSON output.\n\n---\n*AI-assisted response via Claude Code*", + "rationale": "This is a CHANGE_REQUEST: coderabbitai identifies a concrete problem (non-JSON output on stdout) and proposes a specific code fix (use fmt.Fprintf(os.Stderr, ...) instead of fmt.Printf). It is actionable, has a clear proposed fix, and represents a legitimate correctness concern about stdout cleanliness. Not filtered because the review raises a valid issue worth addressing." +} \ No newline at end of file diff --git a/plugins/utils/evals/cases/address-reviews/case-008-categorize-action-instruction/annotations.yaml b/plugins/utils/evals/cases/address-reviews/case-008-categorize-action-instruction/annotations.yaml new file mode 100644 index 000000000..a29aff330 --- /dev/null +++ b/plugins/utils/evals/cases/address-reviews/case-008-categorize-action-instruction/annotations.yaml @@ -0,0 +1,7 @@ +expected_category: ACTION_INSTRUCTION +expected_should_reply: true +expected_should_change_code: false +acceptable_categories: [ACTION_INSTRUCTION] +notes: > + "The PR description needs updated" is a repo-level operation on the + PR itself, not a code change. ACTION_INSTRUCTION is the only correct category. diff --git a/plugins/utils/evals/cases/address-reviews/case-008-categorize-action-instruction/input.yaml b/plugins/utils/evals/cases/address-reviews/case-008-categorize-action-instruction/input.yaml new file mode 100644 index 000000000..9c45d8908 --- /dev/null +++ b/plugins/utils/evals/cases/address-reviews/case-008-categorize-action-instruction/input.yaml @@ -0,0 +1,6 @@ +prompt: | + Using the address-reviews skill categories (ACTION_INSTRUCTION, BLOCKING, CHANGE_REQUEST, QUESTION, SUGGESTION), categorize this real comment from PR #8535. State the category name explicitly: + + Author: bryan-cox + Comment: "The PR description needs updated" + Type: issue_comment (general PR comment) diff --git a/plugins/utils/evals/cases/address-reviews/case-008-categorize-action-instruction/reference-analysis.json b/plugins/utils/evals/cases/address-reviews/case-008-categorize-action-instruction/reference-analysis.json new file mode 100644 index 000000000..278720467 --- /dev/null +++ b/plugins/utils/evals/cases/address-reviews/case-008-categorize-action-instruction/reference-analysis.json @@ -0,0 +1,10 @@ +{ + "category": "ACTION_INSTRUCTION", + "should_reply": true, + "should_change_code": false, + "should_push": false, + "should_filter": false, + "priority_order": null, + "reply_draft": "Done. Updated the PR description. This was a direct request from the reviewer to improve the PR metadata.\n\n---\n*AI-assisted response via Claude Code*", + "rationale": "The comment is a direct instruction to update the PR description text, not a code change request. Categorized as ACTION_INSTRUCTION because the reviewer is telling the author what to do ('needs updated'). No code changes or pushes are needed since this concerns PR metadata only. A reply is appropriate to acknowledge the instruction." +} \ No newline at end of file diff --git a/plugins/utils/evals/cases/address-reviews/case-009-prioritize-mixed/annotations.yaml b/plugins/utils/evals/cases/address-reviews/case-009-prioritize-mixed/annotations.yaml new file mode 100644 index 000000000..a3f365a4f --- /dev/null +++ b/plugins/utils/evals/cases/address-reviews/case-009-prioritize-mixed/annotations.yaml @@ -0,0 +1,9 @@ +expected_priority_order: [A, B, C] +expected_category_a: ACTION_INSTRUCTION +expected_category_b: CHANGE_REQUEST +expected_category_c: QUESTION +notes: > + Comment A (update PR description) is prioritized first as ACTION_INSTRUCTION. + Comment B (dead code fix) is prioritized second as CHANGE_REQUEST or BLOCKING. + Comment C (architecture question) is prioritized last as QUESTION. + Key check is the ordering: A first, B second, C last. diff --git a/plugins/utils/evals/cases/address-reviews/case-009-prioritize-mixed/input.yaml b/plugins/utils/evals/cases/address-reviews/case-009-prioritize-mixed/input.yaml new file mode 100644 index 000000000..9a30de35f --- /dev/null +++ b/plugins/utils/evals/cases/address-reviews/case-009-prioritize-mixed/input.yaml @@ -0,0 +1,11 @@ +prompt: | + Using the address-reviews skill priority order (ACTION_INSTRUCTION → BLOCKING → CHANGE_REQUEST → QUESTION → SUGGESTION), prioritize these real comments from PR #8535. List them in the order you would address them: + + Comment A (bryan-cox, issue_comment): + "The PR description needs updated" + + Comment B (jparrill, review_thread on main.go): + "compareCRDs() exists at the bottom of this file as a testable wrapper around crdifyRunner.Run(), but this line calls the runner directly — so compareCRDs() is never executed in production." + + Comment C (bryan-cox, issue_comment): + "Why do we need our own main.go. Why are we not just calling the cli from https://github.com/kubernetes-sigs/crdify?" diff --git a/plugins/utils/evals/cases/address-reviews/case-009-prioritize-mixed/reference-analysis.json b/plugins/utils/evals/cases/address-reviews/case-009-prioritize-mixed/reference-analysis.json new file mode 100644 index 000000000..954c9d66f --- /dev/null +++ b/plugins/utils/evals/cases/address-reviews/case-009-prioritize-mixed/reference-analysis.json @@ -0,0 +1,10 @@ +{ + "category": "BLOCKING", + "should_reply": true, + "should_change_code": true, + "should_push": false, + "should_filter": false, + "priority_order": ["A", "B", "C"], + "reply_draft": "Addressing all three comments:\n\n**Comment A (ACTION_INSTRUCTION):** Done. Updated the PR description to accurately reflect the current implementation.\n\n**Comment B (BLOCKING):** Done. Changed the call site to use `compareCRDs()` instead of calling `crdifyRunner.Run()` directly. This ensures the testable wrapper is exercised in production and tests actually cover the real code path.\n\n**Comment C (QUESTION):** We maintain our own `main.go` rather than calling the crdify CLI directly because we need a testable wrapper (`compareCRDs()`) around the runner and control over how arguments and configuration are passed. Importing the CLI binary would couple us to its argument parsing and wouldn't allow us to intercept or customize the execution flow.\n\n---\n*AI-assisted response via Claude Code*", + "rationale": "Comment A is ACTION_INSTRUCTION (author directing a PR description update — highest priority). Comment B is BLOCKING (jparrill identified dead code: compareCRDs() is defined as a testable wrapper but never called in production because the call site invokes crdifyRunner.Run() directly — this means tests don't cover the actual production path, a significant correctness issue). Comment C is QUESTION (bryan-cox asking about the architectural choice of maintaining a custom main.go vs using crdify's CLI). Priority order follows ACTION_INSTRUCTION → BLOCKING → QUESTION." +} \ No newline at end of file diff --git a/plugins/utils/evals/cases/address-reviews/case-010-filter-coderabbit-kept/annotations.yaml b/plugins/utils/evals/cases/address-reviews/case-010-filter-coderabbit-kept/annotations.yaml new file mode 100644 index 000000000..bbfd560ab --- /dev/null +++ b/plugins/utils/evals/cases/address-reviews/case-010-filter-coderabbit-kept/annotations.yaml @@ -0,0 +1,6 @@ +expected_should_filter: false +expected_should_reply: true +notes: > + coderabbitai comments should be kept and addressed since coderabbit + provides code review insights. The comment identifies a real bug + (missing continue statement causing fallthrough). diff --git a/plugins/utils/evals/cases/address-reviews/case-010-filter-coderabbit-kept/input.yaml b/plugins/utils/evals/cases/address-reviews/case-010-filter-coderabbit-kept/input.yaml new file mode 100644 index 000000000..dadf3bc05 --- /dev/null +++ b/plugins/utils/evals/cases/address-reviews/case-010-filter-coderabbit-kept/input.yaml @@ -0,0 +1,6 @@ +prompt: | + Using the address-reviews skill filtering rules, should this comment be addressed or filtered out? + + Author: coderabbitai + Comment: "Missing continue after 'skipping comparison' log. The code logs 'skipping comparison' but falls through to Compare() on line 175 anyway." + Type: review_thread (inline code comment) diff --git a/plugins/utils/evals/cases/address-reviews/case-010-filter-coderabbit-kept/reference-analysis.json b/plugins/utils/evals/cases/address-reviews/case-010-filter-coderabbit-kept/reference-analysis.json new file mode 100644 index 000000000..f6c7b40ba --- /dev/null +++ b/plugins/utils/evals/cases/address-reviews/case-010-filter-coderabbit-kept/reference-analysis.json @@ -0,0 +1,10 @@ +{ + "category": "BLOCKING", + "should_reply": true, + "should_change_code": true, + "should_push": false, + "should_filter": false, + "priority_order": ["BLOCKING"], + "reply_draft": "Done. Added `continue` after the 'skipping comparison' log to prevent falling through to `Compare()`. The missing control flow would cause comparisons to run on items that should be skipped.\n\n---\n*AI-assisted response via Claude Code*", + "rationale": "coderabbitai is explicitly kept per filtering rules (only openshift-ci-robot and openshift-ci are filtered as bots). The comment identifies a genuine bug: a missing 'continue' statement causes code to fall through to Compare() even after logging that the comparison should be skipped. This is a logic error categorized as BLOCKING (bug)." +} \ No newline at end of file diff --git a/plugins/utils/evals/cases/address-reviews/case-011-reply-format/annotations.yaml b/plugins/utils/evals/cases/address-reviews/case-011-reply-format/annotations.yaml new file mode 100644 index 000000000..a2a2370d5 --- /dev/null +++ b/plugins/utils/evals/cases/address-reviews/case-011-reply-format/annotations.yaml @@ -0,0 +1,6 @@ +expected_reply_contains: ["Done", "AI-assisted response via Claude Code"] +expected_reply_max_sentences: 3 +notes: > + Reply must start with "Done.", be concise (3 sentences max before + signature), mention calling compareCRDs instead of the runner directly, + and include the "AI-assisted response via Claude Code" footer. diff --git a/plugins/utils/evals/cases/address-reviews/case-011-reply-format/input.yaml b/plugins/utils/evals/cases/address-reviews/case-011-reply-format/input.yaml new file mode 100644 index 000000000..9d5931912 --- /dev/null +++ b/plugins/utils/evals/cases/address-reviews/case-011-reply-format/input.yaml @@ -0,0 +1,6 @@ +prompt: | + Following the address-reviews skill reply template ("Done. [1-line what changed]. [Optional 1-line why]" with "---\n*AI-assisted response via Claude Code*" footer), draft the exact reply you would post for this real review comment from PR #8535: + + Author: jparrill + Comment: "compareCRDs() exists at the bottom of this file as a testable wrapper around crdifyRunner.Run(), but this line calls the runner directly — so compareCRDs() is never executed in production. Either call compareCRDs(oldCRD, newCRD, crdifyRunner) here instead." + File: hack/tools/crd-schema-check/main.go diff --git a/plugins/utils/evals/cases/address-reviews/case-011-reply-format/reference-analysis.json b/plugins/utils/evals/cases/address-reviews/case-011-reply-format/reference-analysis.json new file mode 100644 index 000000000..f588f1753 --- /dev/null +++ b/plugins/utils/evals/cases/address-reviews/case-011-reply-format/reference-analysis.json @@ -0,0 +1,10 @@ +{ + "category": "CHANGE_REQUEST", + "should_reply": true, + "should_change_code": true, + "should_push": false, + "should_filter": false, + "priority_order": null, + "reply_draft": "Done. Replaced direct `crdifyRunner.Run()` call with `compareCRDs(oldCRD, newCRD, crdifyRunner)`. This ensures the testable wrapper is actually exercised in production, not just in tests.\n\n---\n*AI-assisted response via Claude Code*", + "rationale": "The reviewer identifies dead code: compareCRDs() exists as a testable wrapper but is bypassed in production. This is a clear change request with a specific fix (call the wrapper instead of the runner directly). The change is straightforward and the reviewer's reasoning is sound — if the wrapper is only called in tests, any bug in how it invokes the runner would go undetected in production." +} \ No newline at end of file diff --git a/plugins/utils/evals/cases/address-reviews/case-012-ci-push-override/annotations.yaml b/plugins/utils/evals/cases/address-reviews/case-012-ci-push-override/annotations.yaml new file mode 100644 index 000000000..0bcbd0e5a --- /dev/null +++ b/plugins/utils/evals/cases/address-reviews/case-012-ci-push-override/annotations.yaml @@ -0,0 +1,6 @@ +expected_should_push: false +expected_should_change_code: true +notes: > + The response should plan to make the code change and post a reply, + but explicitly state it will skip pushing because the instruction + says pushing is handled automatically. Tests the CI override behavior. diff --git a/plugins/utils/evals/cases/address-reviews/case-012-ci-push-override/input.yaml b/plugins/utils/evals/cases/address-reviews/case-012-ci-push-override/input.yaml new file mode 100644 index 000000000..419c5471e --- /dev/null +++ b/plugins/utils/evals/cases/address-reviews/case-012-ci-push-override/input.yaml @@ -0,0 +1,8 @@ +prompt: | + IMPORTANT: Do NOT push changes - pushing will be handled automatically after you finish. + + Using the address-reviews skill, process this real comment from PR #8535. Describe what steps you would take, specifically addressing what happens at the push step: + + Author: jparrill + Comment: "compareCRDs() exists at the bottom of this file as a testable wrapper around crdifyRunner.Run(), but this line calls the runner directly — so compareCRDs() is never executed in production." + File: hack/tools/crd-schema-check/main.go diff --git a/plugins/utils/evals/cases/address-reviews/case-012-ci-push-override/reference-analysis.json b/plugins/utils/evals/cases/address-reviews/case-012-ci-push-override/reference-analysis.json new file mode 100644 index 000000000..4e8a08712 --- /dev/null +++ b/plugins/utils/evals/cases/address-reviews/case-012-ci-push-override/reference-analysis.json @@ -0,0 +1,10 @@ +{ + "category": "CHANGE_REQUEST", + "should_reply": true, + "should_change_code": true, + "should_push": false, + "should_filter": false, + "priority_order": null, + "reply_draft": "Done. Changed the direct `crdifyRunner.Run()` call to use `compareCRDs()` instead. The wrapper function was already defined for testability but was being bypassed in the main execution path.\n\n---\n*AI-assisted response via Claude Code*", + "rationale": "The reviewer identified that compareCRDs() exists as a testable wrapper around crdifyRunner.Run(), but the main code path calls the runner directly, making the wrapper dead code in production. This is a legitimate code defect that should be fixed by routing through the wrapper. Push is set to false because the instructions explicitly state pushing will be handled automatically." +} \ No newline at end of file diff --git a/plugins/utils/evals/cases/address-reviews/case-013-response-question-no-change/annotations.yaml b/plugins/utils/evals/cases/address-reviews/case-013-response-question-no-change/annotations.yaml new file mode 100644 index 000000000..e1736309f --- /dev/null +++ b/plugins/utils/evals/cases/address-reviews/case-013-response-question-no-change/annotations.yaml @@ -0,0 +1,7 @@ +expected_category: QUESTION +expected_should_reply: true +expected_should_change_code: false +notes: > + This is a directional question about project strategy. The response + should plan to reply with an explanation or ask for user guidance, + and would NOT immediately make code changes. diff --git a/plugins/utils/evals/cases/address-reviews/case-013-response-question-no-change/input.yaml b/plugins/utils/evals/cases/address-reviews/case-013-response-question-no-change/input.yaml new file mode 100644 index 000000000..07976f806 --- /dev/null +++ b/plugins/utils/evals/cases/address-reviews/case-013-response-question-no-change/input.yaml @@ -0,0 +1,6 @@ +prompt: | + Using the address-reviews skill response rules (questions get explanations, only imperative requests get code changes), what action would you take for this real comment from PR #8535? State clearly whether you would make code changes or only reply: + + Author: JoelSpeed + Comment: "We are moving away from crd-schema-checker towards the upstream crdify project, and in fact, in o/api we have disabled the majority of checks in crd-schema-checker because they are implemented in crdify already. Could you weigh in on whether to switch this PR to crdify?" + Type: issue_comment (general PR comment) diff --git a/plugins/utils/evals/cases/address-reviews/case-013-response-question-no-change/reference-analysis.json b/plugins/utils/evals/cases/address-reviews/case-013-response-question-no-change/reference-analysis.json new file mode 100644 index 000000000..b373406f4 --- /dev/null +++ b/plugins/utils/evals/cases/address-reviews/case-013-response-question-no-change/reference-analysis.json @@ -0,0 +1,10 @@ +{ + "category": "QUESTION", + "should_reply": true, + "should_change_code": false, + "should_push": false, + "should_filter": false, + "priority_order": null, + "reply_draft": "Thanks for the heads-up on the migration to crdify. Since o/api has already disabled most crd-schema-checker checks in favor of crdify, it makes sense to align this PR with that direction. I'm happy to switch this PR over to use crdify instead — could you point me to any examples or documentation for integrating crdify in a similar context so I can make the transition smoothly?\n\n---\n*AI-assisted response via Claude Code*", + "rationale": "JoelSpeed is asking for the author's opinion ('Could you weigh in on whether to switch this PR to crdify?') — this is a question seeking input, not an imperative instruction to change code. Per address-reviews rules, questions receive explanations and discussion, not code changes. The reply acknowledges the migration direction, expresses willingness to switch, and asks for guidance to move forward. No code changes should be made until the direction is confirmed and specifics are provided." +} \ No newline at end of file diff --git a/plugins/utils/evals/cases/address-reviews/case-014-response-imperative-change/annotations.yaml b/plugins/utils/evals/cases/address-reviews/case-014-response-imperative-change/annotations.yaml new file mode 100644 index 000000000..4ddeb313e --- /dev/null +++ b/plugins/utils/evals/cases/address-reviews/case-014-response-imperative-change/annotations.yaml @@ -0,0 +1,6 @@ +expected_category: CHANGE_REQUEST +expected_should_reply: true +expected_should_change_code: true +notes: > + The reviewer gave a direct imperative instruction with specific + alternatives to implement. This requires a code change, not just a reply. diff --git a/plugins/utils/evals/cases/address-reviews/case-014-response-imperative-change/input.yaml b/plugins/utils/evals/cases/address-reviews/case-014-response-imperative-change/input.yaml new file mode 100644 index 000000000..f727747ce --- /dev/null +++ b/plugins/utils/evals/cases/address-reviews/case-014-response-imperative-change/input.yaml @@ -0,0 +1,6 @@ +prompt: | + Using the address-reviews skill response rules, would you make a code change or only reply for this real comment? State clearly: + + Author: jparrill + Comment: "Either call compareCRDs(oldCRD, newCRD, crdifyRunner) here instead, or remove compareCRDs() and adjust the tests to call crdifyRunner.Run() directly." + File: hack/tools/crd-schema-check/main.go diff --git a/plugins/utils/evals/cases/address-reviews/case-014-response-imperative-change/reference-analysis.json b/plugins/utils/evals/cases/address-reviews/case-014-response-imperative-change/reference-analysis.json new file mode 100644 index 000000000..5b973c045 --- /dev/null +++ b/plugins/utils/evals/cases/address-reviews/case-014-response-imperative-change/reference-analysis.json @@ -0,0 +1,10 @@ +{ + "category": "CHANGE_REQUEST", + "should_reply": true, + "should_change_code": true, + "should_push": true, + "should_filter": false, + "priority_order": ["CHANGE_REQUEST"], + "reply_draft": "Done. Called `compareCRDs(oldCRD, newCRD, crdifyRunner)` directly at the call site instead of inlining the logic. Keeps the helper reusable for tests.\n\n---\n*AI-assisted response via Claude Code*", + "rationale": "The reviewer uses imperative language ('call...instead', 'remove', 'adjust') presenting two concrete alternatives for restructuring the code. Per response rules, imperative directives like 'call', 'remove', and 'adjust' qualify as explicit change requests requiring code modification, not just a reply. The reply draft assumes the first option (calling compareCRDs directly) as it avoids removing the function and rewriting tests, but the actual choice should be validated against the code context." +} diff --git a/plugins/utils/evals/eval-address-reviews.yaml b/plugins/utils/evals/eval-address-reviews.yaml new file mode 100644 index 000000000..4a97237f9 --- /dev/null +++ b/plugins/utils/evals/eval-address-reviews.yaml @@ -0,0 +1,349 @@ +name: address-reviews-eval +description: Evaluate the address-reviews skill's ability to categorize PR review comments, prevent duplicate replies, prioritize by type, and format responses correctly +skill: utils:address-reviews + +execution: + mode: case + arguments: "{prompt}" + timeout: 120 + max_budget_usd: 1.00 + +runner: + type: claude-code + plugin_dirs: + - plugins/utils + system_prompt: | + Analyze the PR review comment(s) provided and write your analysis to a file + called analysis.json in the current working directory. The file must contain + valid JSON with exactly these fields: + - category: string — one of ACTION_INSTRUCTION, BLOCKING, CHANGE_REQUEST, QUESTION, SUGGESTION, or null if not applicable + - should_reply: boolean — whether a new reply should be posted + - should_change_code: boolean — whether code changes are needed + - should_push: boolean — whether changes should be pushed (false if CI override) + - should_filter: boolean — whether the comment should be filtered out + - priority_order: array of strings or null — ordered list if prioritizing multiple comments + - reply_draft: string or null — the draft reply if should_reply is true + - rationale: string — brief explanation of the decision + Do not wrap the JSON in markdown code fences in the file. + +models: + judge: claude-opus-4-6 + +permissions: + allow: + - "Read" + - "Write" + - "Skill" + - "Bash" + deny: + - "Agent" + - "mcp__*" + +mlflow: + experiment: address-reviews-eval + +dataset: + path: cases/address-reviews + schema: | + Each case directory contains: + - input.yaml: YAML file with fields: + - 'prompt' — the full prompt text for the skill + - annotations.yaml: Expected results and metadata: + - 'expected_category': one of ACTION_INSTRUCTION, BLOCKING, CHANGE_REQUEST, QUESTION, SUGGESTION (optional) + - 'expected_should_reply': boolean (optional) + - 'expected_should_change_code': boolean (optional) + - 'expected_should_push': boolean (optional) + - 'expected_should_filter': boolean (optional) + - 'expected_priority_order': list of comment labels in expected order (optional) + - 'expected_starts_with': expected reply prefix (optional) + - 'expected_contains_footer': boolean (optional) + - 'expected_footer_text': expected footer string (optional) + - 'acceptable_categories': list of acceptable category values (optional) + - 'notes': free text explaining the expected behavior + - reference-analysis.json: Gold reference output from the initial run. + Used as baseline for regression detection via --baseline flag. + Contains the same fields as analysis.json. + +outputs: + - path: "analysis.json" + schema: | + JSON object with fields: + - category: string or null + - should_reply: boolean + - should_change_code: boolean + - should_push: boolean + - should_filter: boolean + - priority_order: array or null + - reply_draft: string or null + - rationale: string + +traces: + stdout: true + stderr: true + events: false + metrics: true + +judges: + - name: valid_output_json + description: Verify output is valid JSON with all required fields + check: | + import json + files = outputs.get("files", {}) + modified = outputs.get("modified_files", {}) + all_f = {**files, **modified} + json_files = {k: v for k, v in all_f.items() if k.endswith("analysis.json")} + if not json_files: + return (False, "No analysis.json found") + content = list(json_files.values())[0] + try: + data = json.loads(content) + except Exception as e: + return (False, f"Invalid JSON: {e}") + required = ["category", "should_reply", "should_change_code", "should_push", "should_filter", "rationale"] + missing = [f for f in required if f not in data] + if missing: + return (False, f"Missing fields: {', '.join(missing)}") + return (True, "Valid JSON with all required fields") + + - name: category_correct + description: Verify predicted category matches expected value or an acceptable alternative + check: | + import json + ann = outputs.get("annotations", {}) + if "expected_category" not in ann: + return (True, "No expected_category in annotations — skipped") + files = outputs.get("files", {}) + modified = outputs.get("modified_files", {}) + all_f = {**files, **modified} + json_files = {k: v for k, v in all_f.items() if k.endswith("analysis.json")} + if not json_files: + return (False, "No analysis.json found") + content = list(json_files.values())[0] + try: + data = json.loads(content) + except Exception as e: + return (False, f"Cannot parse JSON: {e}") + predicted = data.get("category", "") + expected = ann.get("expected_category", "") + acceptable = ann.get("acceptable_categories", [expected]) + if predicted in acceptable: + match = "exact" if predicted == expected else "acceptable" + return (True, f"{match}: predicted={predicted}, expected={expected}") + return (False, f"predicted={predicted}, expected={expected}, acceptable={acceptable}") + + - name: should_reply_correct + description: Verify should_reply decision matches expected value + check: | + import json + ann = outputs.get("annotations", {}) + if "expected_should_reply" not in ann: + return (True, "No expected_should_reply in annotations — skipped") + files = outputs.get("files", {}) + modified = outputs.get("modified_files", {}) + all_f = {**files, **modified} + json_files = {k: v for k, v in all_f.items() if k.endswith("analysis.json")} + if not json_files: + return (False, "No analysis.json found") + content = list(json_files.values())[0] + try: + data = json.loads(content) + except Exception as e: + return (False, f"Cannot parse JSON: {e}") + predicted = data.get("should_reply") + expected = ann.get("expected_should_reply") + if predicted == expected: + return (True, f"should_reply={predicted} matches expected") + return (False, f"should_reply={predicted}, expected={expected}") + + - name: should_change_code_correct + description: Verify should_change_code decision matches expected value + check: | + import json + ann = outputs.get("annotations", {}) + if "expected_should_change_code" not in ann: + return (True, "No expected_should_change_code in annotations — skipped") + files = outputs.get("files", {}) + modified = outputs.get("modified_files", {}) + all_f = {**files, **modified} + json_files = {k: v for k, v in all_f.items() if k.endswith("analysis.json")} + if not json_files: + return (False, "No analysis.json found") + content = list(json_files.values())[0] + try: + data = json.loads(content) + except Exception as e: + return (False, f"Cannot parse JSON: {e}") + predicted = data.get("should_change_code") + expected = ann.get("expected_should_change_code") + if predicted == expected: + return (True, f"should_change_code={predicted} matches expected") + return (False, f"should_change_code={predicted}, expected={expected}") + + - name: should_push_correct + description: Verify should_push respects CI override instructions + check: | + import json + ann = outputs.get("annotations", {}) + if "expected_should_push" not in ann: + return (True, "No expected_should_push in annotations — skipped") + files = outputs.get("files", {}) + modified = outputs.get("modified_files", {}) + all_f = {**files, **modified} + json_files = {k: v for k, v in all_f.items() if k.endswith("analysis.json")} + if not json_files: + return (False, "No analysis.json found") + content = list(json_files.values())[0] + try: + data = json.loads(content) + except Exception as e: + return (False, f"Cannot parse JSON: {e}") + predicted = data.get("should_push") + expected = ann.get("expected_should_push") + if predicted == expected: + return (True, f"should_push={predicted} matches expected") + return (False, f"should_push={predicted}, expected={expected}") + + - name: should_filter_correct + description: Verify should_filter decision matches expected value + check: | + import json + ann = outputs.get("annotations", {}) + if "expected_should_filter" not in ann: + return (True, "No expected_should_filter in annotations — skipped") + files = outputs.get("files", {}) + modified = outputs.get("modified_files", {}) + all_f = {**files, **modified} + json_files = {k: v for k, v in all_f.items() if k.endswith("analysis.json")} + if not json_files: + return (False, "No analysis.json found") + content = list(json_files.values())[0] + try: + data = json.loads(content) + except Exception as e: + return (False, f"Cannot parse JSON: {e}") + predicted = data.get("should_filter") + expected = ann.get("expected_should_filter") + if predicted == expected: + return (True, f"should_filter={predicted} matches expected") + return (False, f"should_filter={predicted}, expected={expected}") + + - name: priority_order_correct + description: Verify priority_order matches expected ordering + check: | + import json + ann = outputs.get("annotations", {}) + if "expected_priority_order" not in ann: + return (True, "No expected_priority_order in annotations — skipped") + files = outputs.get("files", {}) + modified = outputs.get("modified_files", {}) + all_f = {**files, **modified} + json_files = {k: v for k, v in all_f.items() if k.endswith("analysis.json")} + if not json_files: + return (False, "No analysis.json found") + content = list(json_files.values())[0] + try: + data = json.loads(content) + except Exception as e: + return (False, f"Cannot parse JSON: {e}") + predicted = data.get("priority_order") + expected = ann.get("expected_priority_order") + if predicted == expected: + return (True, f"priority_order={predicted} matches expected") + return (False, f"priority_order={predicted}, expected={expected}") + + - name: reply_format_correct + description: Verify reply draft matches expected format constraints + check: | + import json + ann = outputs.get("annotations", {}) + has_checks = ("expected_reply_contains" in ann + or "expected_reply_max_sentences" in ann + or "expected_starts_with" in ann + or "expected_contains_footer" in ann) + if not has_checks: + return (True, "No reply format expectations in annotations — skipped") + files = outputs.get("files", {}) + modified = outputs.get("modified_files", {}) + all_f = {**files, **modified} + json_files = {k: v for k, v in all_f.items() if k.endswith("analysis.json")} + if not json_files: + return (False, "No analysis.json found") + content = list(json_files.values())[0] + try: + data = json.loads(content) + except Exception as e: + return (False, f"Cannot parse JSON: {e}") + reply = data.get("reply_draft", "") + if not reply: + return (False, "No reply_draft in output") + errors = [] + contains = ann.get("expected_reply_contains", []) + for token in contains: + if token not in reply: + errors.append(f"Reply missing required text '{token}'") + starts = ann.get("expected_starts_with") + if starts and not reply.startswith(starts): + errors.append(f"Reply does not start with '{starts}'") + footer = ann.get("expected_footer_text") + if ann.get("expected_contains_footer") and footer and footer not in reply: + errors.append(f"Reply missing footer '{footer}'") + max_sent = ann.get("expected_reply_max_sentences") + if isinstance(max_sent, int): + main = reply.split("---", 1)[0] + sentence_count = sum(main.count(ch) for ch in ".!?") + if sentence_count > max_sent: + errors.append(f"Reply has {sentence_count} sentences; max is {max_sent}") + if errors: + return (False, "; ".join(errors)) + return (True, "Reply format matches expectations") + + - name: analysis_quality + description: LLM judge assessing overall analysis accuracy and rationale quality + prompt: | + You are evaluating whether a PR review comment was correctly analyzed + by the address-reviews skill. + + The skill's analysis output: + + {% for path, content in outputs.files.items() if path.endswith('analysis.json') %} + {{ content }} + {% endfor %} + + Expected behavior from annotations: + + {{ annotations }} + + Evaluate on a 1-5 scale: + + Score 1: Core decision (should_reply, should_change_code, category) is wrong. + Rationale is absent or nonsensical. + Score 2: One core decision is correct but another is wrong. Rationale is + generic and doesn't reference the actual comment content. + Score 3: All core decisions are correct. Rationale mentions relevant aspects + of the comment but may be vague. + Score 4: All core decisions match expected values. Rationale specifically + references signal words or patterns from the comment that justify + the analysis. + Score 5: Perfect match on all fields. Rationale is precise, citing specific + phrases from the comment. All optional fields (priority_order, + reply_draft) are well-formed when present. + +thresholds: + valid_output_json: + min_pass_rate: 1.0 + category_correct: + min_pass_rate: 0.85 + should_reply_correct: + min_pass_rate: 0.90 + should_change_code_correct: + min_pass_rate: 0.90 + should_push_correct: + min_pass_rate: 1.0 + should_filter_correct: + min_pass_rate: 0.90 + priority_order_correct: + min_pass_rate: 0.85 + reply_format_correct: + min_pass_rate: 0.80 + analysis_quality: + min_mean: 3.5 From 031263378b16caa4af35c61d537ca7072f97e8db Mon Sep 17 00:00:00 2001 From: Bryan Cox Date: Mon, 22 Jun 2026 10:05:38 -0400 Subject: [PATCH 2/2] docs: rewrite evals AGENTS.md for eval harness format Co-Authored-By: Claude Opus 4.6 --- .claude-plugin/marketplace.json | 15 +- docs/index.html | 4 +- evals/AGENTS.md | 337 ++++++------------ .../.claude-plugin/plugin.json | 2 +- plugins/utils/.claude-plugin/plugin.json | 2 +- 5 files changed, 132 insertions(+), 228 deletions(-) diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json index bea5f6470..15c3eda66 100644 --- a/.claude-plugin/marketplace.json +++ b/.claude-plugin/marketplace.json @@ -14,7 +14,11 @@ "description": "Go language server for code intelligence and refactoring", "version": "1.0.0", "category": "development", - "keywords": ["go", "gopls", "lsp"] + "keywords": [ + "go", + "gopls", + "lsp" + ] }, { "name": "prodsec-skills", @@ -25,7 +29,10 @@ "description": "Security guidance skills for AI coding assistants", "version": "1.0.0", "category": "security", - "keywords": ["security", "prodsec"] + "keywords": [ + "security", + "prodsec" + ] }, { "name": "git", @@ -148,7 +155,7 @@ "name": "utils", "source": "./plugins/utils", "description": "A generic utilities plugin serving as a catch-all for various helper commands", - "version": "0.0.12", + "version": "0.0.13", "category": "tooling", "keywords": [ "utilities", @@ -428,7 +435,7 @@ "name": "openshift-developer", "source": "./plugins/openshift-developer", "description": "Bundle of curated plugins, skills, and MCP servers useful to any OpenShift engineer", - "version": "1.1.2", + "version": "1.1.3", "category": "bundle", "keywords": [ "openshift", diff --git a/docs/index.html b/docs/index.html index c90205cb6..da145ce01 100644 --- a/docs/index.html +++ b/docs/index.html @@ -2364,7 +2364,7 @@

ai-helpers

{ "name": "openshift-developer", "description": "Bundle of curated plugins, skills, and MCP servers useful to any OpenShift engineer", - "version": "1.1.2", + "version": "1.1.3", "has_readme": true, "commands": [], "skills": [ @@ -2864,7 +2864,7 @@

ai-helpers

{ "name": "utils", "description": "A generic utilities plugin serving as a catch-all for various helper commands and agents", - "version": "0.0.12", + "version": "0.0.13", "has_readme": true, "commands": [ { diff --git a/evals/AGENTS.md b/evals/AGENTS.md index b75d9f8ca..50d8451ac 100644 --- a/evals/AGENTS.md +++ b/evals/AGENTS.md @@ -1,257 +1,154 @@ # Plugin Evals -Behavioral evals for ai-helpers plugins using [promptfoo](https://github.com/promptfoo/promptfoo) with the `anthropic:claude-agent-sdk` provider. +Case-based behavioral evals using the [agent-eval-harness](https://github.com/opendatahub-io/agent-eval-harness) format. Config files are prefixed with `eval-` (e.g., `eval-address-reviews.yaml`). -## Architecture Decisions +## Structure -### Provider: `anthropic:claude-agent-sdk` with Vertex AI -- The SDK provider spawns the Claude Code CLI as a subprocess, giving full agent behavior (tools, skills, plugins) -- Vertex AI auth via `CLAUDE_CODE_USE_VERTEX=true` — no Anthropic API key needed, uses GCP credentials -- `ANTHROPIC_VERTEX_PROJECT_ID` tells Claude Code which GCP project to bill - -### Co-located eval configs -- Eval configs live inside each plugin: `plugins//evals/*.yaml` -- Not in a central `evals/` directory — keeps tests next to the code they test -- The root `evals/` only holds the smoke test config and this file -- Adding evals for a new plugin: create `plugins//evals/.yaml` - -### Assertions -- **`skill-used` / `not-skill-used`**: verifies the agent invokes the correct skill and doesn't route to adjacent ones. Requires the SDK provider (not available with `exec:` providers). Skill names are namespaced: `plugin-name:skill-name` -- **`icontains`**: deterministic string match on agent output — cheap, no LLM judge -- **`llm-rubric`**: LLM-judged quality check — used for fuzzy assertions where exact output varies -- **`cost` / `latency`**: regression guards — fail if a test exceeds thresholds -- **No `output_format`**: we don't use `output_format: json_schema` because it bypasses the Skill tool invocation, which breaks `skill-used` assertions. The agent returns natural text with embedded JSON instead -- See [promptfoo assertion docs](https://www.promptfoo.dev/docs/configuration/expected-outputs/) for the full list of available assertion types - -### Test fixtures -- Issue descriptions for jira evals live in `plugins/jira/evals/fixtures/*.md` -- Referenced via `file://fixtures/.md` in yaml vars -- Plain markdown, not JSON — promptfoo loads them as string variables - -### Test metadata and tiering - -Every test case carries per-test metadata that describes its cost profile. Metadata must be on each test case directly — `defaultTest.metadata` is not used because `--filter-metadata` runs before defaultTest merging. - -Use YAML anchors (`&meta-fast`) and aliases (`*meta-fast`) to avoid repetition: - -```yaml -tests: - - description: "first test" - metadata: &meta-fast # anchor: defines the block - token-usage: medium - judge-size: none - tier: fast - ... - - - description: "second test" - metadata: *meta-fast # alias: reuses the block - ... -``` - -#### Metadata fields - -```yaml -metadata: - token-usage: small | medium | large # agent execution cost - judge-size: none | sonnet | opus # grading model for llm-rubric assertions - tier: fast | medium | heavy # computed from token-usage and judge-size +```text +plugins//evals/ + eval-.yaml # harness config + cases// + case-001-/ + input.yaml # prompt field consumed by the skill + annotations.yaml # expected values for deterministic judges + reference-.json # gold reference from initial run (for baseline regression) ``` -**`token-usage`** — how expensive the agent execution is, derived from the `type: cost` and `type: latency` assertion thresholds on each test: +## Key Concepts -| token-usage | cost threshold | latency threshold | typical test | -|-------------|----------------|-------------------|--------------------------------------| -| small | $0.50 | 30s | simple command, minimal tool calls | -| medium | $0.50 | 60s | single skill invocation with tool use| -| large | $0.60-1.20 | 120-180s | multi-tool workflow, script execution| +- **execution.mode: case** — one case directory per test, each with input + annotations +- **runner** — `type: claude-code` with `plugin_dirs` and a `system_prompt` telling the agent what file to write +- **judges** — deterministic (`check:` Python snippets returning `(bool, str)`) and LLM (`prompt:` rubrics scored 1-5) +- **thresholds** — `min_pass_rate` for deterministic judges, `min_mean` for LLM judges +- **regression detection** — run with `--baseline ` to compare against prior results; gold `reference-*.json` files in case dirs serve as the initial baseline +- **outputs** — the harness reads files written by the agent (e.g., `analysis.json`) and makes them available to judges via `outputs.files` +- **models.judge** — which model scores LLM judge prompts (e.g., `claude-opus-4-6`) -The `cost` assertion checks the agent execution cost only (not judge cost). - -**`judge-size`** — which model grades `llm-rubric` assertions: -- `none`: no LLM judge — only deterministic assertions (`icontains`, `skill-used`, `cost`, `latency`) -- `sonnet`: graded by `vertex:claude-sonnet-4-6` — cheaper, good for straightforward pass/fail -- `opus`: graded by `vertex:claude-opus-4-6` — more capable, for nuanced quality judgments - -The judge model is configured in `defaultTest.options.provider` per eval file. - -**`tier`** — computed from the other two, determines when the test runs: - -| token-usage | judge-size | tier | -|-------------|------------|--------| -| small | none | fast | -| medium | none | fast | -| large | none | medium | -| small | sonnet | fast | -| medium | sonnet | medium | -| large | sonnet | heavy | -| small | opus | medium | -| medium | opus | medium | -| large | opus | heavy | - -Rule: tier is the highest cost contributor across both dimensions. A `none` judge pulls the tier down (no grading cost), while `opus` judge or `large` agent cost pushes it up. - -#### Current test inventory - -| Test | token-usage | judge-size | tier | count | -|------------------------------|-------------|------------|--------|-------| -| hello-world/echo | small | none | fast | 3 | -| classify golden tests | medium | none | fast | 13 | -| classify ambiguous/routing | medium | opus | medium | 2 | -| jira/ready-to-solve | large | opus | heavy | 3 | -| jira/solve | large | opus | heavy | 4 | -| **Total** | | | | **25**| - -#### Running by tier - -```bash -# Fast only — 16 tests, deterministic -make eval-plugins EVAL_TIER=fast - -# Medium only — 2 tests, opus judge but cheap agent -make eval-plugins EVAL_TIER=medium - -# Heavy only — 8 tests, expensive agent + opus judge -make eval-plugins EVAL_TIER=heavy - -# All tiers (default) -make eval-plugins +## File Structure -# Combine with plugin filter -make eval-plugins EVAL_PLUGIN=code-review EVAL_TIER=fast +```text +plugins/ + code-review/evals/ + eval-classify-review-comment.yaml # 13 case-based classification tests + cases/classify-review-comment/ # case directories + utils/evals/ + eval-address-reviews.yaml # 14 case-based analysis tests + cases/address-reviews/ # case directories +evals/ + AGENTS.md # this file + budget.yaml # per-plugin budgets ``` -#### Budget planning - -Measured cost per full run (25 tests, opus agent, Vertex AI): **~$6** - -| Plugin | Tests | Tier | Actual cost | Per-test avg | -|---------------------|-------|-------------|-------------|--------------| -| hello-world | 3 | fast | $0.56 | $0.19 | -| code-review | 15 | fast/medium | $2.70 | $0.18 | -| jira/ready-to-solve | 3 | heavy | $1.62 | $0.54 | -| jira/solve | 5 | heavy | $0.73 | $0.15 | -| **Total** | **26**| | **$5.61** | **$0.22** | +## Current Eval Configs -By tier: +| Plugin | Config | Cases | What it tests | +|--------|--------|-------|---------------| +| code-review | `eval-classify-review-comment.yaml` | 13 | Severity/topic classification | +| utils | `eval-address-reviews.yaml` | 14 | Comment categorization, dedup, reply format | -| Tier | Tests | Actual cost | -|--------|-------|-------------| -| fast | 16 | ~$3.26 | -| medium | 2 | ~$0.33 | -| heavy | 8 | ~$2.02 | - -Agent execution is the dominant cost (~99%). Judge calls are negligible even with opus. - -When setting `cost` thresholds per test, use ~2x the observed per-test max: -- `small`: observed ~$0.20, threshold $0.50 -- `medium`: observed ~$0.21, threshold $0.50 -- `large`: observed $0.27-0.62, threshold $0.60-1.20 - -### Cost and latency thresholds -Per-test thresholds in `defaultTest.assert` catch regressions without flaking: - -| Plugin | Latency | Cost | token-usage | -|---------------------|---------|-------|-------------| -| hello-world | 30s | $0.50 | small | -| code-review | 60s | $0.50 | medium | -| jira/ready-to-solve | 3min | $1.20 | large | -| jira/solve | 2min | $0.60 | large | +## Adding Evals for a New Plugin -### Per-plugin budgets -Budgets are defined centrally in `evals/budget.yaml` with `allowed` (admin-set cap) and `current` (sum of cost thresholds across all tests) per plugin: +1. Create `plugins//evals/eval-.yaml` — use `eval-classify-review-comment.yaml` or `eval-address-reviews.yaml` as templates +2. Create case directories under `plugins//evals/cases//` +3. Each case needs: + - `input.yaml` — skill input (e.g., a `prompt` field) + - `annotations.yaml` — expected values for deterministic judges + - `reference-.json` — gold reference from running the case through `claude --print` +4. Write deterministic judges (`check:` blocks) for core decisions, LLM judges (`prompt:` blocks) for quality +5. Set thresholds: `min_pass_rate` for deterministic, `min_mean` for LLM +6. Add budget entry to `evals/budget.yaml` -| Plugin | Allowed | Current (sum of thresholds) | -|---------------------|---------|----------------------------| -| hello-world | $1.50 | $1.50 | -| code-review | $8.00 | $7.50 | -| jira | $7.00 | $6.00 | -| **Total** | **$16.50** | **$15.00** | +## Writing Judges -`evals/budget.yaml` is the single source of truth for the eval cost model. It contains: -- **`orderings`**: defines `small < medium < large`, `none < sonnet < opus`, `fast < medium < heavy` -- **`token-usage`**: maps each size to its `max-cost` and `max-latency` thresholds -- **`tiers`**: maps each tier to its `max-token-usage` and `max-judge-size` bounds -- **`budgets`**: per-plugin `allowed` (admin cap) and `current` (sum of thresholds) -- **Linter validation rules**: documented as comments — how to validate metadata consistency, threshold bounds, and budget compliance +### Deterministic judges -## File Structure +Python snippets that return `(bool, message)`. Access the agent's output files via `outputs.get("files", {})` and `outputs.get("modified_files", {})`. Access expected values via `outputs.get("annotations", {})`. -```text -plugins/ - hello-world/evals/ - echo.yaml # 3 command output tests - code-review/evals/ - classify-review-comment.yaml # 15 skill classification tests - jira/evals/ - ready-to-solve.yaml # 3 readiness validation tests - solve.yaml # 4 phase-level analysis tests - fixtures/ # test issue descriptions (.md) -evals/ - AGENTS.md # this file - budget.yaml # shared cost definitions + per-plugin budgets - promptfooconfig.yaml # smoke test -package.json # pins promptfoo + claude-agent-sdk versions -.github/workflows/eval-plugins.yml # CI: evals on PRs with changed plugins +```yaml +judges: + - name: category_correct + description: Verify predicted category matches expected + check: | + import json + ann = outputs.get("annotations", {}) + if "expected_category" not in ann: + return (True, "No expected_category — skipped") + files = outputs.get("files", {}) + modified = outputs.get("modified_files", {}) + all_f = {**files, **modified} + json_files = {k: v for k, v in all_f.items() if k.endswith("analysis.json")} + if not json_files: + return (False, "No analysis.json found") + content = list(json_files.values())[0] + data = json.loads(content) + predicted = data.get("category", "") + expected = ann.get("expected_category", "") + acceptable = ann.get("acceptable_categories", [expected]) + if predicted in acceptable: + return (True, f"predicted={predicted} matches") + return (False, f"predicted={predicted}, expected={expected}") ``` -## Running Evals - -### Prerequisites -- `claude` CLI installed and authenticated -- Node.js 22+ -- `gcloud auth application-default login` (for Vertex AI) -- `ANTHROPIC_VERTEX_PROJECT_ID` set +### LLM judges -### Commands +Prompt templates scored 1-5. Use Jinja2 to inject outputs and annotations. -```bash -# Run all plugin evals (parallel) -ANTHROPIC_VERTEX_PROJECT_ID= make eval-plugins +```yaml +judges: + - name: analysis_quality + description: LLM judge assessing overall accuracy + prompt: | + The skill's output: + {% for path, content in outputs.files.items() if path.endswith('analysis.json') %} + {{ content }} + {% endfor %} + + Expected behavior: + {{ annotations }} + + Score 1-5 based on accuracy and rationale quality. +``` -# Single plugin -make eval-plugins EVAL_PLUGIN=hello-world +### Thresholds -# Filter by test description -make eval-plugins EVAL_PLUGIN=code-review EVAL_FILTER=nitpick +```yaml +thresholds: + category_correct: + min_pass_rate: 0.85 # deterministic: fraction of cases that must pass + analysis_quality: + min_mean: 3.5 # LLM: minimum average score across cases +``` -# Filter by tier -make eval-plugins EVAL_TIER=fast +## Gold References and Regression Detection -# Combine plugin and tier -make eval-plugins EVAL_PLUGIN=code-review EVAL_TIER=fast +Each case directory includes a `reference-.json` file containing the output from an initial validated run. These are generated by running the skill through `claude --print` and saving the result. -# Multiple runs with pass rate threshold -make eval-plugins EVAL_REPEAT=3 EVAL_PASS_RATE_THRESHOLD=80 +### How evals detect regressions -# JUnit XML output (for CI) -EVAL_OUTPUT_DIR=./eval-results make eval-plugins +**Deterministic judges are the hard floor.** Each judge checks a specific decision against the expected values in `annotations.yaml`. If a plugin change causes `category_correct` to drop from 13/14 passing to 10/14, that's 71% — below the 85% threshold. The eval fails. -# View results in browser -npx promptfoo view -``` +**LLM judges catch softer regressions.** A change might get all the boolean fields right but produce worse rationales — vague, generic, not citing specific evidence. The LLM judge scores quality 1-5 per case. If the mean drops below the threshold, the eval fails even though hard decisions are still correct. -### Makefile parallelism -`make eval-plugins` discovers all `plugins/*/evals/*.yaml` files and runs them in parallel via `$(MAKE) -j`. Each yaml file becomes a sub-make target (with `/` replaced by `__` to work around Make's pattern rule limitation). All eval files run simultaneously — total wall-clock time equals the slowest plugin, not the sum. +**Baseline comparison shows direction.** Running with `--baseline ` compares the new run against a previous one case-by-case. This shows *which* cases got better or worse, not just the aggregate pass rate. -`EVAL_PLUGIN=` narrows the `find` to a single plugin directory. `EVAL_FILTER=` passes `--filter-pattern` to promptfoo, which matches against test `description:` fields within each yaml. `EVAL_TIER=` passes `--filter-metadata tier=` to run only tests at that tier level. +### Workflow for evaluating a plugin change -## CI Workflow +1. **Before the change**: run the eval, note the run-id +2. **Make the change** to the skill or plugin code +3. **Run again** with `--baseline ` +4. **Interpret the results**: + - All thresholds pass, scores same or higher → **improvement or neutral**, ship it + - Thresholds pass but some individual cases regressed → **mixed**, investigate those cases + - Any threshold breached → **regression**, fix before merging -`.github/workflows/eval-plugins.yml` runs on every PR: +### What the gold reference files provide -1. **detect-changed-plugins**: diffs `plugins/` against the base branch, finds plugins with `evals/` directories -2. **behavioral-evals**: matrix job — one per changed plugin, runs `make eval-plugins EVAL_PLUGIN=` -3. Results rendered as GitHub Check via `dorny/test-reporter` and uploaded as JUnit XML artifacts +The `reference-.json` files are the initial known-good output from the first validated run. They serve as: -The workflow requires `ANTHROPIC_VERTEX_PROJECT_ID` and `GOOGLE_APPLICATION_CREDENTIALS` as GitHub secrets. +- The starting point for the first baseline comparison +- A human-readable record of what the skill produced when the eval was created +- A debugging aid when investigating why a specific case started failing -## Adding Evals for a New Plugin +## Per-Plugin Budgets -1. Create `plugins//evals/.yaml` -2. Use an existing eval as a template (e.g., `plugins/hello-world/evals/echo.yaml`) -3. Set the provider to load your plugin: `plugins: [{type: local, path: ../}]` -4. Add `skill-used` in `defaultTest.assert` if testing a skill -5. Add `cost` and `latency` thresholds in `defaultTest.assert` based on observed values (run once, then set 2-3x) -6. Add per-test `metadata` with `token-usage`, `judge-size`, and `tier` — use YAML anchors to DRY -7. For test data, create `plugins//evals/fixtures/` and reference via `file://fixtures/.md` -8. Run locally: `make eval-plugins EVAL_PLUGIN=` +Budgets are defined in `evals/budget.yaml` with `allowed` (admin-set cap) and `current` per plugin. diff --git a/plugins/openshift-developer/.claude-plugin/plugin.json b/plugins/openshift-developer/.claude-plugin/plugin.json index 70387142d..8f471afb4 100644 --- a/plugins/openshift-developer/.claude-plugin/plugin.json +++ b/plugins/openshift-developer/.claude-plugin/plugin.json @@ -1,7 +1,7 @@ { "name": "openshift-developer", "description": "Bundle of curated plugins, skills, and MCP servers useful to any OpenShift engineer", - "version": "1.1.2", + "version": "1.1.3", "author": { "name": "github.com/openshift-eng" }, diff --git a/plugins/utils/.claude-plugin/plugin.json b/plugins/utils/.claude-plugin/plugin.json index eea6ee6d3..66621d5cb 100644 --- a/plugins/utils/.claude-plugin/plugin.json +++ b/plugins/utils/.claude-plugin/plugin.json @@ -1,7 +1,7 @@ { "name": "utils", "description": "A generic utilities plugin serving as a catch-all for various helper commands and agents", - "version": "0.0.12", + "version": "0.0.13", "author": { "name": "github.com/openshift-eng" }