From 05d1a12744d985e35fa5e7369ba9e3a0182d1193 Mon Sep 17 00:00:00 2001 From: Rakshith Date: Tue, 16 Jun 2026 23:41:18 -0400 Subject: [PATCH 01/10] =?UTF-8?q?feat(ops):=20skill=20self-improvement=20s?= =?UTF-8?q?ubstrate=20=E2=80=94=20outcome,=20annotations,=20evals?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds the qvr-core layer for evidence-gated skill evolution, keeping the binary pure Go (no model calls, no scheduler): - Outcome signal: TOOL/SKILL spans carry qvr.outcome (success|failure| blocked), rolled up to a session verdict; OTLP span status now reflects it instead of a hardcoded OK. Deriver v7->v8 (rederive backfills). Migration 0007. - Human annotations: `qvr audit annotate` / `audit annotations` record a reviewer's verdict in a table that survives rederive (it is durable human input, not a regenerable projection). Migration 0008. - Evals-as-data: evals.yaml manifest + deterministic graders (outcome, text, tool_sequence, tool_constraint, skill_invocation, behavior) graded over the trace qvr already captured — no model calls. internal/eval/. - Eval store + gate: `qvr ops eval run` (exits non-zero on regression), `ops lineage`, `ops promote`, keyed by {skill, commit}. Migration 0009. The seam that keeps core pure: execution stays with the agent; grading, the gate, and lineage are the only parts in the binary, because qvr already has the real lock-verified trace. --- cmd/acceptance_behavioral_test.go | 109 +++++++++ cmd/acceptance_loop_test.go | 213 +++++++++++++++++ cmd/audit.go | 8 +- cmd/audit_annotate.go | 112 +++++++++ cmd/audit_annotate_test.go | 84 +++++++ cmd/audit_annotations.go | 106 +++++++++ cmd/audit_test.go | 6 + cmd/ops.go | 126 ++++++++++ cmd/ops_eval.go | 174 ++++++++++++++ cmd/ops_lineage.go | 148 ++++++++++++ cmd/ops_promote.go | 117 ++++++++++ cmd/ops_test.go | 126 ++++++++++ internal/eval/eval_test.go | 217 ++++++++++++++++++ internal/eval/evidence.go | 95 ++++++++ internal/eval/grade.go | 50 ++++ internal/eval/graders.go | 205 +++++++++++++++++ internal/eval/manifest.go | 157 +++++++++++++ internal/eval/runner.go | 107 +++++++++ internal/ops/derive/claude.go | 4 +- internal/ops/derive/derive.go | 44 +++- internal/ops/derive/outcome_test.go | 156 +++++++++++++ internal/ops/derive/shared.go | 1 + internal/ops/derive/span.go | 75 +++++- internal/ops/rawtrace/capture.go | 1 + internal/ops/store/annotations.go | 118 ++++++++++ internal/ops/store/annotations_test.go | 104 +++++++++ internal/ops/store/eval_runs.go | 177 ++++++++++++++ internal/ops/store/eval_runs_test.go | 66 ++++++ .../migrations/0007_session_meta_outcome.sql | 6 + .../ops/store/migrations/0008_annotations.sql | 19 ++ .../ops/store/migrations/0009_eval_runs.sql | 27 +++ internal/ops/store/scan.go | 8 + internal/ops/store/session_meta.go | 60 +++-- internal/ops/store/session_meta_test.go | 29 +++ internal/ops/store/sqlite.go | 6 + internal/ops/store/store.go | 14 ++ 36 files changed, 3050 insertions(+), 25 deletions(-) create mode 100644 cmd/acceptance_behavioral_test.go create mode 100644 cmd/acceptance_loop_test.go create mode 100644 cmd/audit_annotate.go create mode 100644 cmd/audit_annotate_test.go create mode 100644 cmd/audit_annotations.go create mode 100644 cmd/ops.go create mode 100644 cmd/ops_eval.go create mode 100644 cmd/ops_lineage.go create mode 100644 cmd/ops_promote.go create mode 100644 cmd/ops_test.go create mode 100644 internal/eval/eval_test.go create mode 100644 internal/eval/evidence.go create mode 100644 internal/eval/grade.go create mode 100644 internal/eval/graders.go create mode 100644 internal/eval/manifest.go create mode 100644 internal/eval/runner.go create mode 100644 internal/ops/derive/outcome_test.go create mode 100644 internal/ops/store/annotations.go create mode 100644 internal/ops/store/annotations_test.go create mode 100644 internal/ops/store/eval_runs.go create mode 100644 internal/ops/store/eval_runs_test.go create mode 100644 internal/ops/store/migrations/0007_session_meta_outcome.sql create mode 100644 internal/ops/store/migrations/0008_annotations.sql create mode 100644 internal/ops/store/migrations/0009_eval_runs.sql diff --git a/cmd/acceptance_behavioral_test.go b/cmd/acceptance_behavioral_test.go new file mode 100644 index 0000000..1b6bb9a --- /dev/null +++ b/cmd/acceptance_behavioral_test.go @@ -0,0 +1,109 @@ +package cmd + +import ( + "encoding/json" + "os" + "path/filepath" + "testing" + + "github.com/astra-sh/qvr/internal/config" +) + +// TestAcceptance_BehavioralGate exercises the loop on a DIFFERENT dimension than +// the triage label-match: a skill that must verify its edits by running tests. +// The gate here is behavioral — tool sequence, tool constraints, skill +// invocation, efficiency — so it proves the eval substrate generalizes past +// simple text matching. First run edits without testing (fails); the corrected +// run edits THEN tests (passes). +func TestAcceptance_BehavioralGate(t *testing.T) { + isolatedHome(t, true) + cfg, err := config.Load() + if err != nil { + t.Fatalf("load config: %v", err) + } + skillDir := writeGuardTestsFixture(t) + + // 1. A run that edited code but never ran the tests (Read, Edit — no Bash). + badID := seedSession(t, cfg, sessionSeed{ + StartedMs: 1000, Skill: "guard-tests", FinalMsg: "Applied the change.", + Outcome: "success", Tools: []string{"Read", "Edit"}, + }) + + // 2. Reviewer flags the unverified change. + if _, stderr, err := runRoot(t, nil, "audit", "annotate", badID, + "--skill", "guard-tests", "--outcome", "bad", "--note", "shipped without running tests"); err != nil { + t.Fatalf("annotate: err=%v stderr=%q", err, stderr) + } + + // 3. Gate FAILS: tool_sequence Edit→Bash unsatisfied, Bash missing. + if _, _, err := runRoot(t, nil, "ops", "eval", "run", "guard-tests", + "--skill-dir", skillDir, "--output", "json"); err == nil { + t.Fatal("expected the baseline eval to FAIL for an unverified change") + } + + // 4. Corrected run: Read, Edit, THEN Bash (runs the tests). + seedSession(t, cfg, sessionSeed{ + StartedMs: 2000, Skill: "guard-tests", FinalMsg: "Applied the change and ran the tests.", + Outcome: "success", Tools: []string{"Read", "Edit", "Bash"}, + }) + + // 5. Gate PASSES. + out, _, err := runRoot(t, nil, "ops", "eval", "run", "guard-tests", + "--skill-dir", skillDir, "--output", "json") + if err != nil { + t.Fatalf("expected the post-fix eval to PASS, got err=%v", err) + } + var res struct { + Pass bool `json:"pass"` + Failed int `json:"failed"` + } + if e := json.Unmarshal([]byte(out), &res); e != nil { + t.Fatalf("decode eval json: %v\n%s", e, out) + } + if !res.Pass || res.Failed != 0 { + t.Fatalf("post-fix eval = %+v, want pass with 0 failures", res) + } +} + +// writeGuardTestsFixture creates a fixture skill whose suite asserts the skill +// fired, edits were followed by a test run, and no network was used — all +// behavioral graders over the captured trace. +func writeGuardTestsFixture(t *testing.T) string { + t.Helper() + dir := filepath.Join(t.TempDir(), "guard-tests") + if err := os.MkdirAll(dir, 0o755); err != nil { + t.Fatal(err) + } + skill := `--- +name: guard-tests +description: Edits code and always verifies the change by running the test suite. +metadata: + author: quiver-playground + version: "1.0.0" +--- + +# Guard tests + +After editing code, always run the test suite to verify the change before +reporting done. Never ship an edit you have not verified. +` + evals := `version: 1 +suites: + - name: verifies-changes + cases: + - name: edits-then-runs-tests + graders: + - type: skill_invocation + expectSkills: ["guard-tests"] + - type: tool_sequence + sequence: ["Edit", "Bash"] + - type: tool_constraint + expectTools: ["Bash"] + rejectTools: ["WebFetch"] + - type: behavior + maxTools: 10 +` + mustWrite(t, filepath.Join(dir, "SKILL.md"), skill) + mustWrite(t, filepath.Join(dir, "evals.yaml"), evals) + return dir +} diff --git a/cmd/acceptance_loop_test.go b/cmd/acceptance_loop_test.go new file mode 100644 index 0000000..1501187 --- /dev/null +++ b/cmd/acceptance_loop_test.go @@ -0,0 +1,213 @@ +package cmd + +import ( + "context" + "encoding/json" + "fmt" + "os" + "path/filepath" + "testing" + + "github.com/astra-sh/qvr/internal/config" + "github.com/astra-sh/qvr/internal/ops" + "github.com/astra-sh/qvr/internal/ops/store" + "github.com/google/uuid" +) + +// TestAcceptance_SelfImprovementLoop recreates the article's triage loop end to +// end through the real CLI: a skill mis-triages an issue, a human flags it, the +// eval gate FAILS, the skill is "improved" (a corrected run), and the gate now +// PASSES — with the whole arc visible in `qvr ops lineage`. The fail→pass is the +// branch's acceptance criterion. +func TestAcceptance_SelfImprovementLoop(t *testing.T) { + isolatedHome(t, true) + cfg, err := config.Load() + if err != nil { + t.Fatalf("load config: %v", err) + } + skillDir := writeTriageFixture(t) + + // 1. INNER LOOP: a real run that mis-triaged the issue (the article's + // "ready-to-implement" mistake). Captured + skill-attributed. + badID := seedTriageSession(t, cfg, 1000, "Labeled this issue ready-to-implement.") + + // 2. HUMAN FEEDBACK: the reviewer flips the verdict and says why. + if _, stderr, err := runRoot(t, nil, "audit", "annotate", badID, + "--skill", "triage-issue", "--outcome", "bad", "--note", "ambiguous — needs a setting, should be needs-info"); err != nil { + t.Fatalf("annotate: err=%v stderr=%q", err, stderr) + } + + // 3. BASELINE GATE: the eval must FAIL on the misclassification. + if _, _, err := runRoot(t, nil, "ops", "eval", "run", "triage-issue", + "--skill-dir", skillDir, "--suite", "triage-correctness", "--output", "json"); err == nil { + t.Fatal("expected the baseline eval to FAIL on the misclassified run") + } + + // 4. IMPROVE: the loop edits the skill and re-runs it; the corrected run is + // captured (newer, so it is the most-recent session the eval grades). + seedTriageSession(t, cfg, 2000, "Labeled this issue needs-info pending a decision.") + + // 5. POST-FIX GATE: the same eval must now PASS. + out, _, err := runRoot(t, nil, "ops", "eval", "run", "triage-issue", + "--skill-dir", skillDir, "--suite", "triage-correctness", "--output", "json") + if err != nil { + t.Fatalf("expected the post-fix eval to PASS, got err=%v", err) + } + var res struct { + Pass bool `json:"pass"` + Passed int `json:"passed"` + Failed int `json:"failed"` + } + if e := json.Unmarshal([]byte(out), &res); e != nil { + t.Fatalf("decode eval json: %v\n%s", e, out) + } + if !res.Pass || res.Failed != 0 { + t.Fatalf("post-fix eval = %+v, want pass with 0 failures", res) + } + + // 6. LINEAGE: the timeline shows the fail, then the pass, plus the verdict. + lo, _, err := runRoot(t, nil, "ops", "lineage", "triage-issue", "--output", "json") + if err != nil { + t.Fatalf("lineage: %v", err) + } + assertFailPassArc(t, lo) +} + +// assertFailPassArc checks the lineage timeline carries the article's full arc: +// at least one failed eval, one passed eval, and one human annotation. +func assertFailPassArc(t *testing.T, lineageJSON string) { + t.Helper() + var timeline []struct { + Kind string `json:"kind"` + Pass *bool `json:"pass"` + } + if e := json.Unmarshal([]byte(lineageJSON), &timeline); e != nil { + t.Fatalf("decode lineage json: %v\n%s", e, lineageJSON) + } + evalPass, evalFail, annotations := 0, 0, 0 + for _, e := range timeline { + switch { + case e.Kind == "annotation": + annotations++ + case e.Pass != nil && *e.Pass: + evalPass++ + default: + evalFail++ + } + } + if evalPass < 1 || evalFail < 1 || annotations < 1 { + t.Errorf("lineage missing the fail→pass arc: %d pass, %d fail, %d annotations", evalPass, evalFail, annotations) + } +} + +// writeTriageFixture creates a fixture triage-issue skill with an evals.yaml +// whose correctness suite distinguishes the right label from the wrong one. +func writeTriageFixture(t *testing.T) string { + t.Helper() + dir := filepath.Join(t.TempDir(), "triage-issue") + if err := os.MkdirAll(dir, 0o755); err != nil { + t.Fatal(err) + } + skill := `--- +name: triage-issue +description: Sorts incoming issues into ready-to-implement, duplicate, or needs-info. +metadata: + author: quiver-playground + version: "1.0.0" +--- + +# Triage an issue + +Classify each incoming issue into exactly one bucket: ready-to-implement, +duplicate, or needs-info. When a feature request leaves an ambiguity (e.g. +whether to add a setting), prefer needs-info over ready-to-implement. +` + evals := `version: 1 +suites: + - name: triage-correctness + cases: + - name: ambiguous-feature-needs-info + graders: + - type: outcome + expect: success + - type: text + on: final_message + contains: ["needs-info"] + reject: ["ready-to-implement"] +` + mustWrite(t, filepath.Join(dir, "SKILL.md"), skill) + mustWrite(t, filepath.Join(dir, "evals.yaml"), evals) + return dir +} + +func mustWrite(t *testing.T, path, content string) { + t.Helper() + if err := os.WriteFile(path, []byte(content), 0o644); err != nil { + t.Fatalf("write %s: %v", path, err) + } +} + +// seedTriageSession seeds a successful triage-issue run ending with finalMsg, +// using a single Bash tool. Thin wrapper over seedSession. +func seedTriageSession(t *testing.T, cfg *config.Config, startedMs int64, finalMsg string) string { + t.Helper() + return seedSession(t, cfg, sessionSeed{ + StartedMs: startedMs, Skill: "triage-issue", FinalMsg: finalMsg, + Outcome: "success", Tools: []string{"Bash"}, + }) +} + +// sessionSeed describes a synthetic captured session to inject straight into the +// audit store (bypassing agent-store discovery), so a test controls exactly the +// evidence the graders read: the skill that fired, the ordered tool calls, the +// session outcome, and the final assistant message. +type sessionSeed struct { + StartedMs int64 + Skill string + FinalMsg string + Outcome string + Tools []string // ordered tool names (TOOL spans), after the SKILL span +} + +// seedSession writes the synthetic session and returns its id. +func seedSession(t *testing.T, cfg *config.Config, seed sessionSeed) string { + t.Helper() + ctx := context.Background() + s, err := store.Open(ctx, store.OpenOptions{Path: ops.DBPath(cfg)}) + if err != nil { + t.Fatalf("open store: %v", err) + } + defer s.Close() + + sid := uuid.New() + outMsgs, _ := json.Marshal([]map[string]string{{"role": "assistant", "content": seed.FinalMsg}}) + tid := "trace-" + sid.String()[:8] + end := seed.StartedMs + 1000 + + spans := []*store.SpanRow{ + {SpanID: tid + "-llm", TraceID: tid, SessionID: sid, AgentName: "claude-code", + Kind: "LLM", Name: "chat", StartMs: seed.StartedMs, EndMs: end, + Attributes: fmt.Sprintf(`{"gen_ai.output.messages":%q}`, string(outMsgs))}, + {SpanID: tid + "-skill", TraceID: tid, SessionID: sid, AgentName: "claude-code", + Kind: "SKILL", Name: "execute_tool Skill", StartMs: seed.StartedMs + 50, EndMs: seed.StartedMs + 100, + Attributes: fmt.Sprintf(`{"gen_ai.tool.name":"Skill","skill.name":%q}`, seed.Skill)}, + } + for i, tool := range seed.Tools { + at := seed.StartedMs + int64(100*(i+2)) + spans = append(spans, &store.SpanRow{ + SpanID: fmt.Sprintf("%s-tool%d", tid, i), TraceID: tid, SessionID: sid, AgentName: "claude-code", + Kind: "TOOL", Name: "execute_tool " + tool, StartMs: at, EndMs: at + 50, + Attributes: fmt.Sprintf(`{"gen_ai.tool.name":%q,"qvr.outcome":"success"}`, tool), + }) + } + meta := &store.SessionMetaRow{ + SessionID: sid, AgentName: "claude-code", Model: "claude-opus-4-8", + Title: seed.Skill + " run", StartedMs: seed.StartedMs, EndedMs: end, + Turns: 1, Tools: int64(len(seed.Tools)), Skills: []string{seed.Skill}, Outcome: seed.Outcome, + DeriverVersion: 8, + } + if err := s.ReplaceSessionDerivation(ctx, meta, spans); err != nil { + t.Fatalf("seed session: %v", err) + } + return sid.String() +} diff --git a/cmd/audit.go b/cmd/audit.go index 0478cde..830453e 100644 --- a/cmd/audit.go +++ b/cmd/audit.go @@ -26,9 +26,11 @@ instantly). Each session's verbatim trace lands in a local SQLite database, attributed to the exact locked skill version that ran. Query it with 'qvr audit logs' / 'qvr audit sessions'. -The everyday surface is enable/disable, discover, status, sessions, logs, and -export. The remaining subcommands (ingest, raw, spans, rederive, gc) are -low-level plumbing the maintenance paths use and are hidden from this list.`, +The everyday surface is enable/disable, discover, status, sessions, logs, +export, and annotate/annotations (record and read human verdicts on what a +skill actually did — the feedback a self-improvement loop reads). The remaining +subcommands (ingest, raw, spans, rederive, gc) are low-level plumbing the +maintenance paths use and are hidden from this list.`, // Reject a typo'd subcommand (`qvr audit enabel`) with a non-zero exit // instead of silently printing help (issue #169 — the #120 fix missed this // parent). No args still prints help. diff --git a/cmd/audit_annotate.go b/cmd/audit_annotate.go new file mode 100644 index 0000000..9386baa --- /dev/null +++ b/cmd/audit_annotate.go @@ -0,0 +1,112 @@ +package cmd + +import ( + "fmt" + "os/user" + "time" + + "github.com/astra-sh/qvr/internal/config" + "github.com/astra-sh/qvr/internal/ops/store" + "github.com/google/uuid" + "github.com/spf13/cobra" +) + +var ( + annotateOutcome string + annotateNote string + annotateSkill string + annotateAuthor string +) + +var auditAnnotateCmd = &cobra.Command{ + Use: "annotate ", + Short: "Record a human verdict on a captured session", + Long: `Attaches a durable human verdict — the outer-loop feedback signal — to a +captured session: did the skill do the right thing, and why? + +Unlike the auto-derived span outcome (which only reflects whether a tool +errored), an annotation is a reviewer's judgement ("this triage was wrong, the +issue was ambiguous") and is never overwritten by 'qvr audit rederive' — it +lives outside the regenerable spans/session_meta projection. A self-improvement +loop reads these verdicts (alongside the auto outcome) to decide what to fix. + +Scope a verdict to one skill with --skill, or omit it for a whole-session +verdict. Annotations are append-only: a session can accrue several over time.`, + Args: cobra.ExactArgs(1), + RunE: runAuditAnnotate, +} + +func init() { + auditAnnotateCmd.Flags().StringVar(&annotateOutcome, "outcome", "", "the verdict (e.g. good, bad, blocked) — required") + auditAnnotateCmd.Flags().StringVar(&annotateNote, "note", "", "why — the reason for the verdict") + auditAnnotateCmd.Flags().StringVar(&annotateSkill, "skill", "", "scope the verdict to one skill (omit for whole-session)") + auditAnnotateCmd.Flags().StringVar(&annotateAuthor, "author", "", "verdict author (defaults to the current OS user)") + auditCmd.AddCommand(auditAnnotateCmd) +} + +func runAuditAnnotate(cmd *cobra.Command, args []string) error { + if annotateOutcome == "" { + return fmt.Errorf("--outcome is required (e.g. --outcome bad)") + } + id, err := uuid.Parse(args[0]) + if err != nil { + return fmt.Errorf("invalid session id %q: %w", args[0], err) + } + + cfg, err := config.Load() + if err != nil { + return err + } + if !auditDBExists(cfg) { + return fmt.Errorf("no audit database yet — run `qvr audit discover` first") + } + + // A verdict is a write, so open read-write (this also applies migration + // 0008 if the DB predates the annotations table). + s, err := openAuditStore(cmd.Context(), cfg, false) + if err != nil { + return fmt.Errorf("open audit store: %w", err) + } + defer s.Close() + + meta, err := s.GetSessionMeta(cmd.Context(), id) + if err != nil { + return fmt.Errorf("look up session: %w", err) + } + if meta == nil { + return fmt.Errorf("no captured session %s — run `qvr audit sessions` to list ids", id) + } + + a := &store.AnnotationRow{ + SessionID: id, + Skill: annotateSkill, + Outcome: annotateOutcome, + Note: annotateNote, + Author: annotateAuthor, + CreatedAt: time.Now().UTC(), + } + if a.Author == "" { + a.Author = currentUserName() + } + if err := s.PutAnnotation(cmd.Context(), a); err != nil { + return err + } + + if outputFormat == "json" { + return printer.JSON(a) + } + scope := "session" + if a.Skill != "" { + scope = "skill " + a.Skill + } + printer.Info(fmt.Sprintf("Recorded %q verdict on %s (%s)", a.Outcome, scope, id)) + return nil +} + +// currentUserName returns the OS username for verdict provenance, or "unknown". +func currentUserName() string { + if u, err := user.Current(); err == nil && u.Username != "" { + return u.Username + } + return "unknown" +} diff --git a/cmd/audit_annotate_test.go b/cmd/audit_annotate_test.go new file mode 100644 index 0000000..a366955 --- /dev/null +++ b/cmd/audit_annotate_test.go @@ -0,0 +1,84 @@ +package cmd + +import ( + "encoding/json" + "strings" + "testing" +) + +// TestAudit_AnnotateRoundTrip drives the full CLI: discover a seeded session, +// record a human verdict on it, then read it back via `audit annotations`. +func TestAudit_AnnotateRoundTrip(t *testing.T) { + home, _ := isolatedHome(t, true) + t.Setenv("HOME", home) + seedClaudeStore(t, home) + + if _, stderr, err := runRoot(t, nil, "audit", "discover", "--agent", "claude", "--output", "json"); err != nil { + t.Fatalf("discover: err=%v stderr=%q", err, stderr) + } + + // Grab the discovered session id. + sessionsOut, _, err := runRoot(t, nil, "audit", "sessions", "--output", "json") + if err != nil { + t.Fatalf("sessions: %v", err) + } + var sessions []struct { + SessionID string `json:"session_id"` + } + if e := json.Unmarshal([]byte(sessionsOut), &sessions); e != nil || len(sessions) == 0 { + t.Fatalf("decode sessions: %v\n%s", e, sessionsOut) + } + sid := sessions[0].SessionID + + // Record a per-skill verdict. + out, stderr, err := runRoot(t, nil, "audit", "annotate", sid, + "--skill", "code-review", "--outcome", "bad", "--note", "missed the bug", "--output", "json") + if err != nil { + t.Fatalf("annotate: err=%v stderr=%q", err, stderr) + } + var wrote struct { + Outcome string `json:"outcome"` + Skill string `json:"skill"` + Author string `json:"author"` + } + if e := json.Unmarshal([]byte(out), &wrote); e != nil { + t.Fatalf("decode annotate json: %v\n%s", e, out) + } + if wrote.Outcome != "bad" || wrote.Skill != "code-review" || wrote.Author == "" { + t.Fatalf("annotate echo = %+v, want outcome=bad skill=code-review author!=\"\"", wrote) + } + + // Read it back. + listOut, _, err := runRoot(t, nil, "audit", "annotations", "--skill", "code-review", "--output", "json") + if err != nil { + t.Fatalf("annotations list: %v", err) + } + if !strings.Contains(listOut, "missed the bug") || !strings.Contains(listOut, sid) { + t.Errorf("annotation not listed: %s", listOut) + } +} + +// TestAudit_AnnotateRequiresOutcome pins the required --outcome flag. +func TestAudit_AnnotateRequiresOutcome(t *testing.T) { + home, _ := isolatedHome(t, true) + t.Setenv("HOME", home) + if _, _, err := runRoot(t, nil, "audit", "annotate", "00000000-0000-0000-0000-000000000000"); err == nil { + t.Error("expected an error when --outcome is omitted") + } +} + +// TestAudit_AnnotateUnknownSession rejects a verdict on a session that was +// never captured. +func TestAudit_AnnotateUnknownSession(t *testing.T) { + home, _ := isolatedHome(t, true) + t.Setenv("HOME", home) + seedClaudeStore(t, home) + if _, _, err := runRoot(t, nil, "audit", "discover", "--agent", "claude"); err != nil { + t.Fatalf("discover: %v", err) + } + _, _, err := runRoot(t, nil, "audit", "annotate", + "11111111-1111-1111-1111-111111111111", "--outcome", "bad") + if err == nil { + t.Error("expected an error annotating an unknown session") + } +} diff --git a/cmd/audit_annotations.go b/cmd/audit_annotations.go new file mode 100644 index 0000000..e06a2ab --- /dev/null +++ b/cmd/audit_annotations.go @@ -0,0 +1,106 @@ +package cmd + +import ( + "fmt" + + "github.com/astra-sh/qvr/internal/config" + "github.com/astra-sh/qvr/internal/ops/store" + "github.com/google/uuid" + "github.com/spf13/cobra" +) + +var ( + annotationsSession string + annotationsSkill string + annotationsSince string +) + +var auditAnnotationsCmd = &cobra.Command{ + Use: "annotations", + Short: "List human verdicts recorded on captured sessions", + Long: `Lists the human verdicts recorded with 'qvr audit annotate', newest-first. +These are the outer-loop feedback signal a self-improvement loop reads to decide +what to fix. Scope with --session and/or --skill.`, + Args: cobra.NoArgs, + RunE: runAuditAnnotations, +} + +func init() { + auditAnnotationsCmd.Flags().StringVar(&annotationsSession, "session", "", "filter to one session id") + auditAnnotationsCmd.Flags().StringVar(&annotationsSkill, "skill", "", "filter to one skill") + auditAnnotationsCmd.Flags().StringVar(&annotationsSince, "since", "", "only verdicts since this time (e.g. 7d, 24h, or RFC3339)") + auditCmd.AddCommand(auditAnnotationsCmd) +} + +func runAuditAnnotations(cmd *cobra.Command, args []string) error { + cfg, err := config.Load() + if err != nil { + return err + } + if !auditDBExists(cfg) { + if outputFormat == "json" { + return printer.JSON([]any{}) + } + printer.Info("No annotations recorded yet") + return nil + } + + f := &store.AnnotationFilter{Skill: annotationsSkill} + if annotationsSession != "" { + id, perr := uuid.Parse(annotationsSession) + if perr != nil { + return fmt.Errorf("invalid --session id %q: %w", annotationsSession, perr) + } + f.SessionID = &id + } + if annotationsSince != "" { + t, perr := parseTimeFlag(annotationsSince) + if perr != nil { + return fmt.Errorf("invalid --since: %w", perr) + } + f.Since = &t + } + + s, err := openAuditStore(cmd.Context(), cfg, true) + if err != nil { + return fmt.Errorf("open audit store: %w", err) + } + defer s.Close() + + rows, err := s.ListAnnotations(cmd.Context(), f) + if err != nil { + return fmt.Errorf("list annotations: %w", err) + } + return renderAnnotations(rows) +} + +func renderAnnotations(rows []*store.AnnotationRow) error { + if outputFormat == "json" { + if len(rows) == 0 { + return printer.JSON([]any{}) + } + return printer.JSON(rows) + } + if len(rows) == 0 { + printer.Info("No annotations recorded yet") + return nil + } + headers := []string{"CREATED", "OUTCOME", "SKILL", "AUTHOR", "NOTE", "SESSION ID"} + out := make([][]string, 0, len(rows)) + for _, a := range rows { + skill := a.Skill + if skill == "" { + skill = "(session)" + } + out = append(out, []string{ + a.CreatedAt.Local().Format("01-02 15:04"), + a.Outcome, + skill, + a.Author, + clipCell(a.Note, 48), + a.SessionID.String(), + }) + } + printer.Table(headers, out) + return nil +} diff --git a/cmd/audit_test.go b/cmd/audit_test.go index b5a4ed2..e990041 100644 --- a/cmd/audit_test.go +++ b/cmd/audit_test.go @@ -99,6 +99,12 @@ func runRoot(t *testing.T, stdin []byte, args ...string) (string, string, error) ingestAgent, ingestSession, ingestCwd = "", "", "" discoverAgents, discoverSince = nil, "" discoverKeepAll, discoverDryRun = false, false + // SkillOps + annotation command flags (same persistence hazard). + evalSkillDir, evalSuite, evalSession = "", "", "" + annotateOutcome, annotateNote, annotateSkill, annotateAuthor = "", "", "", "" + annotationsSession, annotationsSkill, annotationsSince = "", "", "" + promoteReason, promoteSkillDir, lineageSince = "", "", "" + promoteForce = false }) return stdout, stderr, err } diff --git a/cmd/ops.go b/cmd/ops.go new file mode 100644 index 0000000..49784c4 --- /dev/null +++ b/cmd/ops.go @@ -0,0 +1,126 @@ +package cmd + +import ( + "fmt" + "os" + "path/filepath" + "sort" + + "github.com/astra-sh/qvr/internal/config" + "github.com/astra-sh/qvr/internal/eval" + "github.com/astra-sh/qvr/internal/model" + "github.com/spf13/cobra" +) + +// opsCmd is the parent for SkillOps — evaluating skills as first-class +// artifacts and gating their evolution on evidence. It builds on the audit +// trace layer: `qvr ops eval` grades a skill's captured sessions against its +// evals.yaml, and `qvr ops lineage` joins those verdicts to the spans by the +// exact locked commit that ran. +var opsCmd = &cobra.Command{ + Use: "ops", + Short: "[experimental] Evaluate skills and gate their evolution on evidence", + Long: `[EXPERIMENTAL] SkillOps treats a skill as a first-class artifact with an +evals.yaml manifest (a sibling of SKILL.md). It grades the skill's CAPTURED +sessions deterministically — pure Go over the spans qvr already derived, no +model calls, no agent execution — and keys every verdict to the exact locked +commit that ran. That is the evidence a self-improvement loop gates on: improve +the skill, re-run it, and 'qvr ops eval' must show the score improve before the +change ships. + +Subcommands: eval (grade a skill), lineage (verdicts over commits), promote +(refuse to advance a skill without a passing eval).`, + RunE: rejectUnknownSubcommand, +} + +func init() { + rootCmd.AddCommand(opsCmd) +} + +// resolvedSkill carries everything the ops subcommands need about a target +// skill: where its evals.yaml/SKILL.md live, and the locked commit that pins +// every eval verdict (the lock × evidence join). Commit is "" for a skill with +// no project lock entry (e.g. a local fixture not yet committed). +type resolvedSkill struct { + Name string + Dir string + Commit string +} + +// resolveSkill locates a skill's directory and locked commit. An explicit +// --skill-dir wins (the evolve loop points it at the ejected, writable copy); +// otherwise the installed project target dir is searched. The commit is read +// from the project lock when present. +func resolveSkill(name, skillDirFlag string) (*resolvedSkill, error) { + projectRoot, err := os.Getwd() + if err != nil { + return nil, fmt.Errorf("resolve cwd: %w", err) + } + out := &resolvedSkill{Name: name} + + if skillDirFlag != "" { + out.Dir = skillDirFlag + } else { + dir, ok := installedSkillDir(projectRoot, name) + if !ok { + return nil, fmt.Errorf("could not find skill %q under any target dir — pass --skill-dir ", name) + } + out.Dir = dir + } + if _, err := os.Stat(filepath.Join(out.Dir, "SKILL.md")); err != nil { + return nil, fmt.Errorf("%s has no SKILL.md: %w", out.Dir, err) + } + out.Commit = lockedCommit(projectRoot, name) + return out, nil +} + +// installedSkillDir searches the distinct project-relative target dirs for an +// installed skill named name, returning the first that holds a SKILL.md. The +// dirs are searched in sorted order so resolution is deterministic. +func installedSkillDir(projectRoot, name string) (string, bool) { + seen := map[string]bool{} + var dirs []string + for _, t := range model.Targets { + if !seen[t.LocalDir] { + seen[t.LocalDir] = true + dirs = append(dirs, t.LocalDir) + } + } + sort.Strings(dirs) + for _, d := range dirs { + cand := filepath.Join(projectRoot, d, name) + if _, err := os.Stat(filepath.Join(cand, "SKILL.md")); err == nil { + return cand, true + } + } + return "", false +} + +// lockedCommit reads the project lock for the skill's pinned commit, or "" when +// there is no lock/entry (best-effort: a missing commit just leaves the eval +// run keyed by an empty commit, still time-ordered for lineage). +func lockedCommit(projectRoot, name string) string { + lockPath := model.DefaultLockPath(projectRoot, config.Dir(), false) + lock, err := model.ReadLockFile(lockPath) + if err != nil { + return "" + } + if e, ok := lock.Skills[name]; ok { + return e.Commit + } + return "" +} + +// loadEvalSuiteNames is a small helper for diagnostics: the suite names a +// skill's manifest defines, or nil. +func loadEvalSuiteNames(dir string) []string { + man, err := eval.Load(dir) + if err != nil || man == nil { + return nil + } + names := make([]string, 0, len(man.Suites)) + for _, s := range man.Suites { + names = append(names, s.Name) + } + return names +} diff --git a/cmd/ops_eval.go b/cmd/ops_eval.go new file mode 100644 index 0000000..1d1c04b --- /dev/null +++ b/cmd/ops_eval.go @@ -0,0 +1,174 @@ +package cmd + +import ( + "fmt" + "strings" + + "github.com/astra-sh/qvr/internal/config" + "github.com/astra-sh/qvr/internal/eval" + "github.com/astra-sh/qvr/internal/ops/store" + "github.com/google/uuid" + "github.com/spf13/cobra" +) + +var ( + evalSkillDir string + evalSuite string + evalSession string +) + +var opsEvalCmd = &cobra.Command{ + Use: "eval", + Short: "Grade a skill's captured sessions against its evals.yaml", + Long: `Grades a skill against its evals.yaml manifest, deterministically, over the +sessions qvr already captured — no agent run, no model call. The verdict is +recorded keyed by the skill's locked commit, so 'qvr ops lineage' can show the +score move across versions.`, + RunE: rejectUnknownSubcommand, +} + +var opsEvalRunCmd = &cobra.Command{ + Use: "run ", + Short: "Run a skill's evals against its latest captured session", + Long: `Grades the skill's most recent captured session (or --session ) against +the suite(s) in its evals.yaml, records the result keyed by the locked commit, +and exits non-zero if any case fails — so a CI step can gate a change. + +By default the installed project skill dir is used; pass --skill-dir to evaluate +a specific directory (e.g. the writable copy an evolution loop is editing).`, + Args: cobra.ExactArgs(1), + RunE: runOpsEval, +} + +func init() { + opsEvalRunCmd.Flags().StringVar(&evalSkillDir, "skill-dir", "", "skill directory holding evals.yaml (default: the installed project skill)") + opsEvalRunCmd.Flags().StringVar(&evalSuite, "suite", "", "evaluate only this suite (default: all)") + opsEvalRunCmd.Flags().StringVar(&evalSession, "session", "", "grade this session id (default: the skill's most recent)") + opsEvalCmd.AddCommand(opsEvalRunCmd) + opsCmd.AddCommand(opsEvalCmd) +} + +func runOpsEval(cmd *cobra.Command, args []string) error { + rs, err := resolveSkill(args[0], evalSkillDir) + if err != nil { + return err + } + + cfg, err := config.Load() + if err != nil { + return err + } + if !auditDBExists(cfg) { + return fmt.Errorf("no audit database yet — run `qvr audit discover` first") + } + // Eval writes an eval_runs row, so open read-write (applies migration 0009). + s, err := openAuditStore(cmd.Context(), cfg, false) + if err != nil { + return fmt.Errorf("open audit store: %w", err) + } + defer s.Close() + + in := eval.RunInput{SkillName: rs.Name, SkillDir: rs.Dir, Suite: evalSuite} + if evalSession != "" { + id, perr := uuid.Parse(evalSession) + if perr != nil { + return fmt.Errorf("invalid --session id %q: %w", evalSession, perr) + } + in.SessionID = &id + } + + result, err := eval.Run(cmd.Context(), s, in) + if err != nil { + if names := loadEvalSuiteNames(rs.Dir); evalSuite != "" && len(names) > 0 { + return fmt.Errorf("%w (available suites: %s)", err, strings.Join(names, ", ")) + } + return err + } + + if _, err := s.PutEvalRun(cmd.Context(), evalRunRow(rs, result)); err != nil { + return fmt.Errorf("record eval run: %w", err) + } + + if rerr := renderEvalResult(result); rerr != nil { + return rerr + } + // A failed gate exits non-zero so CI / the evolution loop can branch on it. + if !result.Pass { + return fmt.Errorf("eval failed: %d of %d cases failed for %s", result.Failed, result.Passed+result.Failed, rs.Name) + } + return nil +} + +// evalRunRow flattens a run result into the persisted row + its case rows. +func evalRunRow(rs *resolvedSkill, r *eval.RunResult) *store.EvalRunRow { + suite := evalSuite + if suite == "" { + suite = "*" + } + row := &store.EvalRunRow{ + SkillName: rs.Name, + SkillCommit: rs.Commit, + Suite: suite, + SessionID: r.SessionID, + Passed: r.Passed, + Failed: r.Failed, + Pass: r.Pass, + } + for _, su := range r.Suites { + for _, c := range su.Cases { + row.Cases = append(row.Cases, store.EvalCaseRow{ + Suite: su.Suite, Case: c.Case, Pass: c.Pass, Detail: caseDetail(c), + }) + } + } + return row +} + +// caseDetail joins the failing graders' details into a one-line reason. +func caseDetail(c eval.CaseResult) string { + if c.Pass { + return "" + } + var parts []string + for _, g := range c.Graders { + if !g.Pass { + parts = append(parts, fmt.Sprintf("%s: %s", g.Type, g.Detail)) + } + } + return strings.Join(parts, "; ") +} + +func renderEvalResult(r *eval.RunResult) error { + if outputFormat == "json" { + return printer.JSON(r) + } + verdict := "PASS" + if !r.Pass { + verdict = "FAIL" + } + printer.Info(fmt.Sprintf("%s — %s: %d passed, %d failed (session %s)", + verdict, r.Skill, r.Passed, r.Failed, shortID(r.SessionID))) + headers := []string{"SUITE", "CASE", "RESULT", "DETAIL"} + rows := make([][]string, 0) + for _, su := range r.Suites { + for _, c := range su.Cases { + res := "pass" + if !c.Pass { + res = "FAIL" + } + rows = append(rows, []string{su.Suite, c.Case, res, clipCell(caseDetail(c), 60)}) + } + } + if len(rows) > 0 { + printer.Table(headers, rows) + } + return nil +} + +// shortID clips a session id to its first segment for compact display. +func shortID(id string) string { + if len(id) >= 8 { + return id[:8] + } + return id +} diff --git a/cmd/ops_lineage.go b/cmd/ops_lineage.go new file mode 100644 index 0000000..f99d1b1 --- /dev/null +++ b/cmd/ops_lineage.go @@ -0,0 +1,148 @@ +package cmd + +import ( + "fmt" + "sort" + "time" + + "github.com/astra-sh/qvr/internal/config" + "github.com/astra-sh/qvr/internal/ops/store" + "github.com/spf13/cobra" +) + +var lineageSince string + +var opsLineageCmd = &cobra.Command{ + Use: "lineage ", + Short: "Show a skill's eval verdicts and human annotations over time", + Long: `Merges a skill's eval runs and human annotations into one time-ordered +timeline, each eval keyed to the locked commit that was graded. This is how the +self-improvement loop reads "at this commit the suite failed; after the fix it +passed" — the lock × evidence join made legible.`, + Args: cobra.ExactArgs(1), + RunE: runOpsLineage, +} + +func init() { + opsLineageCmd.Flags().StringVar(&lineageSince, "since", "", "only entries since this time (e.g. 30d, 24h, or RFC3339)") + opsCmd.AddCommand(opsLineageCmd) +} + +// lineageEntry is one row of the merged timeline. +type lineageEntry struct { + At time.Time `json:"at"` + Kind string `json:"kind"` // "eval" or "annotation" + Commit string `json:"commit,omitempty"` + Detail string `json:"detail"` + Pass *bool `json:"pass,omitempty"` + Outcome string `json:"outcome,omitempty"` +} + +func runOpsLineage(cmd *cobra.Command, args []string) error { + name := args[0] + cfg, err := config.Load() + if err != nil { + return err + } + if !auditDBExists(cfg) { + if outputFormat == "json" { + return printer.JSON([]any{}) + } + printer.Info("Nothing recorded yet") + return nil + } + s, err := openAuditStore(cmd.Context(), cfg, true) + if err != nil { + return fmt.Errorf("open audit store: %w", err) + } + defer s.Close() + + var since *time.Time + if lineageSince != "" { + t, perr := parseTimeFlag(lineageSince) + if perr != nil { + return fmt.Errorf("invalid --since: %w", perr) + } + since = &t + } + + runs, err := s.ListEvalRuns(cmd.Context(), &store.EvalRunFilter{SkillName: name, Since: since}) + if err != nil { + return fmt.Errorf("list eval runs: %w", err) + } + anns, err := s.ListAnnotations(cmd.Context(), &store.AnnotationFilter{Skill: name, Since: since}) + if err != nil { + return fmt.Errorf("list annotations: %w", err) + } + + timeline := mergeLineage(runs, anns) + return renderLineage(timeline) +} + +// mergeLineage interleaves eval runs and annotations into one newest-first +// timeline. +func mergeLineage(runs []*store.EvalRunRow, anns []*store.AnnotationRow) []lineageEntry { + var out []lineageEntry + for _, r := range runs { + pass := r.Pass + out = append(out, lineageEntry{ + At: r.StartedAt, Kind: "eval", Commit: shortCommit(r.SkillCommit), Pass: &pass, + Detail: fmt.Sprintf("suite %s: %d passed, %d failed", r.Suite, r.Passed, r.Failed), + }) + } + for _, a := range anns { + out = append(out, lineageEntry{ + At: a.CreatedAt, Kind: "annotation", Outcome: a.Outcome, + Detail: annotationDetail(a), + }) + } + sort.SliceStable(out, func(i, j int) bool { return out[i].At.After(out[j].At) }) + return out +} + +func annotationDetail(a *store.AnnotationRow) string { + if a.Note != "" { + return fmt.Sprintf("%s — %s", a.Outcome, a.Note) + } + return a.Outcome +} + +func renderLineage(timeline []lineageEntry) error { + if outputFormat == "json" { + if len(timeline) == 0 { + return printer.JSON([]any{}) + } + return printer.JSON(timeline) + } + if len(timeline) == 0 { + printer.Info("No eval runs or annotations for that skill yet") + return nil + } + headers := []string{"WHEN", "KIND", "COMMIT", "RESULT", "DETAIL"} + rows := make([][]string, 0, len(timeline)) + for _, e := range timeline { + result := "" + switch { + case e.Pass != nil && *e.Pass: + result = "pass" + case e.Pass != nil: + result = "FAIL" + case e.Outcome != "": + result = e.Outcome + } + rows = append(rows, []string{ + e.At.Local().Format("01-02 15:04"), + e.Kind, e.Commit, result, clipCell(e.Detail, 56), + }) + } + printer.Table(headers, rows) + return nil +} + +// shortCommit clips a git SHA to 7 chars, leaving "" (no lock) untouched. +func shortCommit(c string) string { + if len(c) >= 7 { + return c[:7] + } + return c +} diff --git a/cmd/ops_promote.go b/cmd/ops_promote.go new file mode 100644 index 0000000..97eca56 --- /dev/null +++ b/cmd/ops_promote.go @@ -0,0 +1,117 @@ +package cmd + +import ( + "fmt" + + "github.com/astra-sh/qvr/internal/config" + "github.com/astra-sh/qvr/internal/ops/store" + "github.com/spf13/cobra" +) + +var ( + promoteReason string + promoteForce bool + promoteSkillDir string +) + +var opsPromoteCmd = &cobra.Command{ + Use: "promote ", + Short: "Gate a skill's advancement on a passing eval", + Long: `Refuses to advance a skill whose currently-locked commit has no passing eval +run, unless --force-no-eval is given. This is the evidence gate the +self-improvement loop ends on: a drafted improvement only ships once +'qvr ops eval run' has recorded a pass for that exact commit. + +It is a check, not a state change: it reports whether the locked commit is +backed by evidence, so a loop or CI step can branch on the exit code.`, + Args: cobra.ExactArgs(1), + RunE: runOpsPromote, +} + +func init() { + opsPromoteCmd.Flags().StringVar(&promoteReason, "reason", "", "why the skill is being promoted (recorded in the message)") + opsPromoteCmd.Flags().BoolVar(&promoteForce, "force-no-eval", false, "promote even without a passing eval (records the override)") + opsPromoteCmd.Flags().StringVar(&promoteSkillDir, "skill-dir", "", "skill directory (default: the installed project skill)") + opsCmd.AddCommand(opsPromoteCmd) +} + +func runOpsPromote(cmd *cobra.Command, args []string) error { + rs, err := resolveSkill(args[0], promoteSkillDir) + if err != nil { + return err + } + + cfg, err := config.Load() + if err != nil { + return err + } + if !auditDBExists(cfg) { + return fmt.Errorf("no audit database yet — run `qvr audit discover` first") + } + s, err := openAuditStore(cmd.Context(), cfg, true) + if err != nil { + return fmt.Errorf("open audit store: %w", err) + } + defer s.Close() + + passing := latestPassingEval(cmd, s, rs) + decision := promoteDecision(rs, passing) + + if outputFormat == "json" { + if err := printer.JSON(decision); err != nil { + return err + } + } else { + printer.Info(decision.Message) + } + if !decision.Promoted { + return fmt.Errorf("refusing to promote %s: no passing eval for commit %s (run `qvr ops eval run %s`, or pass --force-no-eval)", + rs.Name, shortCommit(rs.Commit), rs.Name) + } + return nil +} + +// promoteDecisionResult is the JSON/text shape of a promote check. +type promoteDecisionResult struct { + Skill string `json:"skill"` + Commit string `json:"commit,omitempty"` + Promoted bool `json:"promoted"` + Forced bool `json:"forced,omitempty"` + Reason string `json:"reason,omitempty"` + Message string `json:"message"` +} + +func promoteDecision(rs *resolvedSkill, passing *store.EvalRunRow) promoteDecisionResult { + d := promoteDecisionResult{Skill: rs.Name, Commit: shortCommit(rs.Commit), Reason: promoteReason} + switch { + case passing != nil: + d.Promoted = true + d.Message = fmt.Sprintf("%s @ %s is backed by a passing eval (run #%d) — clear to promote", rs.Name, shortCommit(rs.Commit), passing.ID) + case promoteForce: + d.Promoted = true + d.Forced = true + d.Message = fmt.Sprintf("%s @ %s promoted WITHOUT a passing eval (--force-no-eval)", rs.Name, shortCommit(rs.Commit)) + default: + d.Message = fmt.Sprintf("%s @ %s has no passing eval", rs.Name, shortCommit(rs.Commit)) + } + return d +} + +// latestPassingEval returns the newest passing eval run for the skill's locked +// commit, or nil. A skill with no locked commit ("") can't be evidence-gated by +// commit, so this returns nil (promotion then requires --force-no-eval). +func latestPassingEval(cmd *cobra.Command, s store.Store, rs *resolvedSkill) *store.EvalRunRow { + if rs.Commit == "" { + return nil + } + runs, err := s.ListEvalRuns(cmd.Context(), &store.EvalRunFilter{SkillName: rs.Name, SkillCommit: rs.Commit}) + if err != nil { + return nil + } + for _, r := range runs { // newest-first + if r.Pass { + return r + } + } + return nil +} diff --git a/cmd/ops_test.go b/cmd/ops_test.go new file mode 100644 index 0000000..fb37894 --- /dev/null +++ b/cmd/ops_test.go @@ -0,0 +1,126 @@ +package cmd + +import ( + "encoding/json" + "strings" + "testing" + + "github.com/astra-sh/qvr/internal/config" + "github.com/astra-sh/qvr/internal/ops/store" +) + +// TestOpsEval_NoDatabase pins the helpful error when nothing has been captured. +func TestOpsEval_NoDatabase(t *testing.T) { + isolatedHome(t, true) + dir := writeGuardTestsFixture(t) + _, _, err := runRoot(t, nil, "ops", "eval", "run", "guard-tests", "--skill-dir", dir) + if err == nil || !strings.Contains(err.Error(), "audit database") { + t.Errorf("want a 'no audit database' error, got %v", err) + } +} + +// TestOpsEval_UnknownSkillDir rejects a --skill-dir with no SKILL.md. +func TestOpsEval_UnknownSkillDir(t *testing.T) { + isolatedHome(t, true) + _, _, err := runRoot(t, nil, "ops", "eval", "run", "ghost", "--skill-dir", t.TempDir()) + if err == nil || !strings.Contains(err.Error(), "SKILL.md") { + t.Errorf("want a missing-SKILL.md error, got %v", err) + } +} + +// TestOpsEval_UnknownSuite reports the available suites. +func TestOpsEval_UnknownSuite(t *testing.T) { + isolatedHome(t, true) + cfg, _ := config.Load() + dir := writeGuardTestsFixture(t) + seedSession(t, cfg, sessionSeed{StartedMs: 1000, Skill: "guard-tests", Outcome: "success", Tools: []string{"Bash"}}) + + _, _, err := runRoot(t, nil, "ops", "eval", "run", "guard-tests", "--skill-dir", dir, "--suite", "nope") + if err == nil || !strings.Contains(err.Error(), "verifies-changes") { + t.Errorf("want an error listing available suites, got %v", err) + } +} + +// TestOpsEval_NoSessions errors helpfully when the skill has no captured run. +func TestOpsEval_NoSessions(t *testing.T) { + isolatedHome(t, true) + cfg, _ := config.Load() + dir := writeGuardTestsFixture(t) + // Create the DB (a session for a DIFFERENT skill) so the DB exists but the + // target skill has no runs. + seedSession(t, cfg, sessionSeed{StartedMs: 1000, Skill: "other-skill", Outcome: "success", Tools: []string{"Bash"}}) + + _, _, err := runRoot(t, nil, "ops", "eval", "run", "guard-tests", "--skill-dir", dir) + if err == nil || !strings.Contains(err.Error(), "no captured sessions") { + t.Errorf("want a 'no captured sessions' error, got %v", err) + } +} + +// TestOpsPromote_Gate proves the evidence gate: without a passing eval the +// promote refuses (non-zero), and --force-no-eval overrides it. +func TestOpsPromote_Gate(t *testing.T) { + isolatedHome(t, true) + cfg, _ := config.Load() + dir := writeGuardTestsFixture(t) + seedSession(t, cfg, sessionSeed{StartedMs: 1000, Skill: "guard-tests", Outcome: "success", Tools: []string{"Bash"}}) + + // No eval recorded yet → refuse. + if _, _, err := runRoot(t, nil, "ops", "promote", "guard-tests", "--skill-dir", dir); err == nil { + t.Error("expected promote to refuse without a passing eval") + } + + // Forced → allowed, and the JSON records the override. + out, _, err := runRoot(t, nil, "ops", "promote", "guard-tests", "--skill-dir", dir, "--force-no-eval", "--output", "json") + if err != nil { + t.Fatalf("forced promote: %v", err) + } + var d struct { + Promoted bool `json:"promoted"` + Forced bool `json:"forced"` + } + if e := json.Unmarshal([]byte(out), &d); e != nil { + t.Fatalf("decode: %v\n%s", e, out) + } + if !d.Promoted || !d.Forced { + t.Errorf("forced promote = %+v, want promoted+forced", d) + } +} + +// TestPromoteDecision unit-tests the pure gate logic across its three branches. +func TestPromoteDecision(t *testing.T) { + rs := &resolvedSkill{Name: "guard-tests", Commit: "abc1234"} + + t.Run("passing eval clears it", func(t *testing.T) { + promoteForce = false + d := promoteDecision(rs, &store.EvalRunRow{ID: 7, Pass: true}) + if !d.Promoted || d.Forced { + t.Errorf("want promoted, not forced: %+v", d) + } + }) + t.Run("no eval refuses", func(t *testing.T) { + promoteForce = false + if d := promoteDecision(rs, nil); d.Promoted { + t.Errorf("want refusal, got %+v", d) + } + }) + t.Run("force overrides", func(t *testing.T) { + promoteForce = true + d := promoteDecision(rs, nil) + if !d.Promoted || !d.Forced { + t.Errorf("want forced promote: %+v", d) + } + promoteForce = false + }) +} + +// TestOpsLineage_Empty pins the no-data path: empty JSON, exit 0. +func TestOpsLineage_Empty(t *testing.T) { + isolatedHome(t, true) + out, _, err := runRoot(t, nil, "ops", "lineage", "guard-tests", "--output", "json") + if err != nil { + t.Fatalf("lineage on empty: %v", err) + } + if strings.TrimSpace(out) != "[]" { + t.Errorf("want empty array, got %q", out) + } +} diff --git a/internal/eval/eval_test.go b/internal/eval/eval_test.go new file mode 100644 index 0000000..489a590 --- /dev/null +++ b/internal/eval/eval_test.go @@ -0,0 +1,217 @@ +package eval + +import ( + "os" + "path/filepath" + "testing" + + "github.com/astra-sh/qvr/internal/ops/store" + "github.com/google/uuid" +) + +// TestParse_RejectsNoOpGraders pins the hardening rule: a grader with no real +// assertion (a gate that can never fail) is a config error, not a silent pass. +func TestParse_RejectsNoOpGraders(t *testing.T) { + cases := []struct { + name string + grader string + wantErr bool + }{ + {"outcome ok", "type: outcome\n expect: success", false}, + {"outcome no expect", "type: outcome", true}, + {"text ok", "type: text\n contains: [\"x\"]", false}, + {"text empty", "type: text", true}, + {"sequence empty", "type: tool_sequence", true}, + {"constraint empty", "type: tool_constraint", true}, + {"constraint maxTools ok", "type: tool_constraint\n maxTools: 5", false}, + {"skill_invocation empty", "type: skill_invocation", true}, + {"behavior empty", "type: behavior", true}, + {"behavior maxTurns ok", "type: behavior\n maxTurns: 3", false}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + y := "version: 1\nsuites:\n - name: s\n cases:\n - name: c\n graders:\n - " + tc.grader + "\n" + _, err := Parse([]byte(y)) + if (err != nil) != tc.wantErr { + t.Errorf("Parse err=%v, wantErr=%v\nyaml:\n%s", err, tc.wantErr, y) + } + }) + } +} + +// TestLoad_AbsentManifestIsNil pins the contract: a skill with no evals.yaml is +// not an error (it just has no evals), while malformed YAML is. +func TestLoad_AbsentManifestIsNil(t *testing.T) { + dir := t.TempDir() + m, err := Load(dir) + if err != nil || m != nil { + t.Errorf("absent manifest: got (%v, %v), want (nil, nil)", m, err) + } + + if err := os.WriteFile(filepath.Join(dir, ManifestFile), []byte("version: 1\nsuites: [oops\n"), 0o644); err != nil { + t.Fatal(err) + } + if _, err := Load(dir); err == nil { + t.Error("expected an error for malformed evals.yaml") + } +} + +// TestBuildEvidence distils a captured session into grader input: the final +// assistant message and the tool sequence (ordered by start time, even when the +// store hands spans back out of order). +func TestBuildEvidence(t *testing.T) { + sid := uuid.New() + meta := &store.SessionMetaRow{ + SessionID: sid, Outcome: "success", Skills: []string{"triage-issue"}, + Tools: 2, Turns: 1, StartedMs: 1000, EndedMs: 2500, + } + // Spans deliberately out of order; the second tool starts later. + spans := []*store.SpanRow{ + {Kind: "TOOL", StartMs: 1200, Attributes: `{"gen_ai.tool.name":"Edit"}`}, + {Kind: "LLM", StartMs: 1000, Attributes: `{"gen_ai.output.messages":"[{\"role\":\"assistant\",\"content\":\"Labeled needs-info\"}]"}`}, + {Kind: "TOOL", StartMs: 1100, Attributes: `{"gen_ai.tool.name":"Read"}`}, + } + e := BuildEvidence(meta, spans) + if e.Outcome != "success" || e.FinalMessage != "Labeled needs-info" { + t.Errorf("evidence basics wrong: %+v", e) + } + if len(e.ToolSequence) != 2 || e.ToolSequence[0] != "Read" || e.ToolSequence[1] != "Edit" { + t.Errorf("tool sequence not ordered by start time: %v", e.ToolSequence) + } + if e.DurationMs != 1500 { + t.Errorf("duration = %d, want 1500", e.DurationMs) + } +} + +// TestBuildEvidence_ToleratesGarbage pins robustness: malformed or empty span +// attribute blobs must degrade gracefully (empty fields), never panic. +func TestBuildEvidence_ToleratesGarbage(t *testing.T) { + meta := &store.SessionMetaRow{SessionID: uuid.New()} + spans := []*store.SpanRow{ + {Kind: "TOOL", StartMs: 1, Attributes: "not json{"}, + {Kind: "LLM", StartMs: 2, Attributes: ""}, + {Kind: "TOOL", StartMs: 3, Attributes: `{"gen_ai.tool.name":"Read"}`}, + } + e := BuildEvidence(meta, spans) + if e.FinalMessage != "" { + t.Errorf("want empty final message from garbage, got %q", e.FinalMessage) + } + if len(e.ToolSequence) != 1 || e.ToolSequence[0] != "Read" { + t.Errorf("want only the well-formed tool, got %v", e.ToolSequence) + } +} + +func TestParse_ValidatesVersionAndStructure(t *testing.T) { + cases := []struct { + name string + yaml string + wantErr bool + }{ + {"good", ` +version: 1 +suites: + - name: triage-correctness + cases: + - name: needs-info + graders: + - type: text + contains: ["needs-info"] +`, false}, + {"bad version", "version: 2\nsuites: []\n", true}, + {"no suites", "version: 1\nsuites: []\n", true}, + {"suite no cases", "version: 1\nsuites:\n - name: s\n cases: []\n", true}, + {"unknown grader", ` +version: 1 +suites: + - name: s + cases: + - name: c + graders: + - type: prompt +`, true}, + {"case no graders", ` +version: 1 +suites: + - name: s + cases: + - name: c + graders: [] +`, true}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + _, err := Parse([]byte(tc.yaml)) + if (err != nil) != tc.wantErr { + t.Errorf("Parse err=%v, wantErr=%v", err, tc.wantErr) + } + }) + } +} + +func ev() *Evidence { + return &Evidence{ + SessionID: "s1", + Skill: "triage-issue", + Outcome: "success", + FinalMessage: "Labeled the issue needs-info because the requirement is ambiguous.", + ToolSequence: []string{"Read", "Bash", "Edit"}, + Skills: []string{"triage-issue"}, + Tools: 3, + Turns: 2, + DurationMs: 1500, + } +} + +func TestGraders(t *testing.T) { + cases := []struct { + name string + spec GraderSpec + want bool + }{ + {"outcome pass", GraderSpec{Type: "outcome", Expect: "success"}, true}, + {"outcome fail", GraderSpec{Type: "outcome", Expect: "failure"}, false}, + {"text contains pass", GraderSpec{Type: "text", Contains: []string{"needs-info"}}, true}, + {"text contains fail", GraderSpec{Type: "text", Contains: []string{"ready-to-implement"}}, false}, + {"text reject pass", GraderSpec{Type: "text", Reject: []string{"ready-to-implement"}}, true}, + {"text reject fail", GraderSpec{Type: "text", Reject: []string{"needs-info"}}, false}, + {"sequence pass", GraderSpec{Type: "tool_sequence", Sequence: []string{"Read", "Edit"}}, true}, + {"sequence fail (order)", GraderSpec{Type: "tool_sequence", Sequence: []string{"Edit", "Read"}}, false}, + {"constraint expect pass", GraderSpec{Type: "tool_constraint", ExpectTools: []string{"Bash"}}, true}, + {"constraint reject fail", GraderSpec{Type: "tool_constraint", RejectTools: []string{"Bash"}}, false}, + {"constraint max fail", GraderSpec{Type: "tool_constraint", MaxTools: 2}, false}, + {"skill expect pass", GraderSpec{Type: "skill_invocation", ExpectSkills: []string{"triage-issue"}}, true}, + {"skill reject fail", GraderSpec{Type: "skill_invocation", RejectSkills: []string{"triage-issue"}}, false}, + {"behavior pass", GraderSpec{Type: "behavior", MaxTools: 5, MaxTurns: 3}, true}, + {"behavior fail turns", GraderSpec{Type: "behavior", MaxTurns: 1}, false}, + {"behavior fail duration", GraderSpec{Type: "behavior", MaxDurationMs: 1000}, false}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + got := runGrader(tc.spec, ev()) + if got.Pass != tc.want { + t.Errorf("grader %s: pass=%v, want %v (detail=%q)", tc.spec.Type, got.Pass, tc.want, got.Detail) + } + }) + } +} + +// TestGradeSuite_AllMustPass pins the AND semantics: one failing grader fails +// the case, and one failing case fails the suite. +func TestGradeSuite_AllMustPass(t *testing.T) { + s := &Suite{Name: "triage-correctness", Cases: []Case{ + {Name: "needs-info", Graders: []GraderSpec{ + {Type: "outcome", Expect: "success"}, + {Type: "text", Contains: []string{"needs-info"}, Reject: []string{"ready-to-implement"}}, + }}, + }} + if r := GradeSuite(s, ev()); !r.Pass || r.Passed != 1 || r.Failed != 0 { + t.Errorf("expected suite pass, got %+v", r) + } + + // Flip the evidence to the pre-improvement (misclassified) run. + bad := ev() + bad.FinalMessage = "Labeled the issue ready-to-implement." + if r := GradeSuite(s, bad); r.Pass || r.Failed != 1 { + t.Errorf("expected suite fail on misclassification, got %+v", r) + } +} diff --git a/internal/eval/evidence.go b/internal/eval/evidence.go new file mode 100644 index 0000000..11a4cdd --- /dev/null +++ b/internal/eval/evidence.go @@ -0,0 +1,95 @@ +package eval + +import ( + "encoding/json" + "slices" + "sort" + + "github.com/astra-sh/qvr/internal/ops/derive" + "github.com/astra-sh/qvr/internal/ops/store" +) + +// Evidence is the read model the graders judge: everything a deterministic +// grader needs about one captured session, distilled from its derived spans and +// session_meta. It deliberately carries no model output beyond the final +// assistant message — graders read behavior (tools, skills, outcome), not +// prose, except for the simple substring checks the text grader runs. +type Evidence struct { + SessionID string `json:"session_id"` + Skill string `json:"skill,omitempty"` + Outcome string `json:"outcome,omitempty"` // session-level qvr.outcome rollup + FinalMessage string `json:"final_message,omitempty"` + ToolSequence []string `json:"tool_sequence,omitempty"` // ordered tool names + Skills []string `json:"skills,omitempty"` // skills that fired + Tools int `json:"tools"` + Turns int `json:"turns"` + DurationMs int64 `json:"duration_ms"` +} + +// BuildEvidence distils a captured session into grader input. spans need not be +// pre-sorted; the tool sequence is ordered by span start time so an +// out-of-order store read can't scramble it. +func BuildEvidence(meta *store.SessionMetaRow, spans []*store.SpanRow) *Evidence { + e := &Evidence{ + SessionID: meta.SessionID.String(), + Outcome: meta.Outcome, + Skills: append([]string(nil), meta.Skills...), + Tools: int(meta.Tools), + Turns: int(meta.Turns), + DurationMs: meta.DurationMs(), + } + + ordered := append([]*store.SpanRow(nil), spans...) + sort.SliceStable(ordered, func(i, j int) bool { return ordered[i].StartMs < ordered[j].StartMs }) + + var lastLLMText string + for _, sp := range ordered { + attrs := decodeAttrs(sp.Attributes) + switch sp.Kind { + case derive.KindTool, derive.KindSkill: + if name, _ := attrs["gen_ai.tool.name"].(string); name != "" { + e.ToolSequence = append(e.ToolSequence, name) + } + case derive.KindLLM: + if msg := lastAssistantText(attrs); msg != "" { + lastLLMText = msg + } + } + } + e.FinalMessage = lastLLMText + return e +} + +// decodeAttrs parses a span's JSON attribute blob, or returns an empty map. +func decodeAttrs(raw string) map[string]any { + if raw == "" { + return map[string]any{} + } + var m map[string]any + if err := json.Unmarshal([]byte(raw), &m); err != nil { + return map[string]any{} + } + return m +} + +// lastAssistantText pulls the assistant message text out of an LLM span's +// gen_ai.output.messages attribute (a JSON-encoded [{role,content}] array). +func lastAssistantText(attrs map[string]any) string { + raw, _ := attrs["gen_ai.output.messages"].(string) + if raw == "" { + return "" + } + var msgs []struct { + Role string `json:"role"` + Content string `json:"content"` + } + if err := json.Unmarshal([]byte(raw), &msgs); err != nil { + return "" + } + for _, m := range slices.Backward(msgs) { + if m.Content != "" { + return m.Content + } + } + return "" +} diff --git a/internal/eval/grade.go b/internal/eval/grade.go new file mode 100644 index 0000000..cf968b0 --- /dev/null +++ b/internal/eval/grade.go @@ -0,0 +1,50 @@ +package eval + +// CaseResult is the verdict on one case: it passes only when every grader +// passed. The per-grader results are retained for diagnostics. +type CaseResult struct { + Case string `json:"case"` + Pass bool `json:"pass"` + Graders []GraderResult `json:"graders"` +} + +// SuiteResult aggregates a suite's case verdicts. +type SuiteResult struct { + Suite string `json:"suite"` + Pass bool `json:"pass"` + Passed int `json:"passed"` + Failed int `json:"failed"` + Cases []CaseResult `json:"cases"` +} + +// GradeCase runs every grader in a case against the evidence. The case passes +// only when all of them pass (AND semantics — a gate is as strong as its +// strictest check). +func GradeCase(c Case, e *Evidence) CaseResult { + r := CaseResult{Case: c.Name, Pass: true} + for _, g := range c.Graders { + gr := runGrader(g, e) + r.Graders = append(r.Graders, gr) + if !gr.Pass { + r.Pass = false + } + } + return r +} + +// GradeSuite grades every case in a suite against the same evidence and rolls +// up the pass/fail counts. The suite passes only when every case passes. +func GradeSuite(s *Suite, e *Evidence) SuiteResult { + res := SuiteResult{Suite: s.Name, Pass: true} + for _, c := range s.Cases { + cr := GradeCase(c, e) + res.Cases = append(res.Cases, cr) + if cr.Pass { + res.Passed++ + } else { + res.Failed++ + res.Pass = false + } + } + return res +} diff --git a/internal/eval/graders.go b/internal/eval/graders.go new file mode 100644 index 0000000..f1bfa7d --- /dev/null +++ b/internal/eval/graders.go @@ -0,0 +1,205 @@ +package eval + +import ( + "fmt" + "strings" +) + +// GraderResult is one grader's verdict on one case. +type GraderResult struct { + Type string `json:"type"` + Name string `json:"name,omitempty"` + Pass bool `json:"pass"` + Detail string `json:"detail,omitempty"` +} + +// graderFunc judges evidence against a spec. Every grader is pure: same +// (spec, evidence) → same result, so an eval run is reproducible. +type graderFunc func(GraderSpec, *Evidence) GraderResult + +// graders is the registry of deterministic, trace-based graders. Each reads the +// spans/meta qvr already captured — none runs an agent or calls a model. New +// grader types register here; the LLM-judge ("prompt") grader is deliberately +// NOT here — semantic judgement is layered on top as a skill, keeping the core +// gate model-free. +var graders = map[string]graderFunc{ + "outcome": gradeOutcome, + "text": gradeText, + "tool_sequence": gradeToolSequence, + "tool_constraint": gradeToolConstraint, + "skill_invocation": gradeSkillInvocation, + "behavior": gradeBehavior, +} + +// runGrader dispatches one grader spec. An unknown type fails closed (a gate +// that silently skips an unrecognized grader is worse than no gate); Load +// already rejects these, so this is the belt-and-braces path. +func runGrader(g GraderSpec, e *Evidence) GraderResult { + fn, ok := graders[g.Type] + if !ok { + return GraderResult{Type: g.Type, Name: g.Name, Pass: false, Detail: "unknown grader type"} + } + return fn(g, e) +} + +// graderValidators rejects a grader whose config would make it a no-op — a gate +// that can never fail is worse than no gate, because it reads as evidence while +// asserting nothing. Each grader must carry at least one real assertion. Kept as +// a registry (parallel to `graders`) so each check stays tiny. +var graderValidators = map[string]func(GraderSpec) error{ + "outcome": func(g GraderSpec) error { + return need(g.Expect != "", "needs `expect`") + }, + "text": func(g GraderSpec) error { + return need(len(g.Contains) > 0 || len(g.Reject) > 0, "needs `contains` or `reject`") + }, + "tool_sequence": func(g GraderSpec) error { + return need(len(g.Sequence) > 0, "needs `sequence`") + }, + "tool_constraint": func(g GraderSpec) error { + return need(len(g.ExpectTools) > 0 || len(g.RejectTools) > 0 || g.MaxTools > 0, + "needs `expectTools`, `rejectTools`, or `maxTools`") + }, + "skill_invocation": func(g GraderSpec) error { + return need(len(g.ExpectSkills) > 0 || len(g.RejectSkills) > 0, "needs `expectSkills` or `rejectSkills`") + }, + "behavior": func(g GraderSpec) error { + return need(g.MaxTools > 0 || g.MaxTurns > 0 || g.MaxDurationMs > 0, + "needs `maxTools`, `maxTurns`, or `maxDurationMs`") + }, +} + +// validateGraderConfig runs the registered config check for a grader type. +func validateGraderConfig(g GraderSpec) error { + if v, ok := graderValidators[g.Type]; ok { + return v(g) + } + return nil +} + +// need returns an error with msg unless ok. +func need(ok bool, msg string) error { + if ok { + return nil + } + return fmt.Errorf("%s", msg) +} + +func gradeOutcome(g GraderSpec, e *Evidence) GraderResult { + pass := e.Outcome == g.Expect + detail := "" + if !pass { + got := e.Outcome + if got == "" { + got = "unknown" + } + detail = fmt.Sprintf("outcome=%s, want %s", got, g.Expect) + } + return GraderResult{Type: g.Type, Name: g.Name, Pass: pass, Detail: detail} +} + +func gradeText(g GraderSpec, e *Evidence) GraderResult { + hay := evidenceField(g.On, e) + for _, want := range g.Contains { + if !strings.Contains(hay, want) { + return fail(g, fmt.Sprintf("missing %q", want)) + } + } + for _, bad := range g.Reject { + if strings.Contains(hay, bad) { + return fail(g, fmt.Sprintf("contains rejected %q", bad)) + } + } + return pass(g) +} + +// evidenceField selects the text body a text grader reads. final_message is the +// default and only field today; an unknown name falls back to it. +func evidenceField(name string, e *Evidence) string { + switch name { + case "", "final_message": + return e.FinalMessage + default: + return e.FinalMessage + } +} + +func gradeToolSequence(g GraderSpec, e *Evidence) GraderResult { + if isSubsequence(g.Sequence, e.ToolSequence) { + return pass(g) + } + return fail(g, fmt.Sprintf("sequence %v not found in %v", g.Sequence, e.ToolSequence)) +} + +func gradeToolConstraint(g GraderSpec, e *Evidence) GraderResult { + used := toSet(e.ToolSequence) + for _, want := range g.ExpectTools { + if !used[want] { + return fail(g, fmt.Sprintf("expected tool %q never used", want)) + } + } + for _, bad := range g.RejectTools { + if used[bad] { + return fail(g, fmt.Sprintf("rejected tool %q was used", bad)) + } + } + if g.MaxTools > 0 && len(e.ToolSequence) > g.MaxTools { + return fail(g, fmt.Sprintf("%d tool calls exceed max %d", len(e.ToolSequence), g.MaxTools)) + } + return pass(g) +} + +func gradeSkillInvocation(g GraderSpec, e *Evidence) GraderResult { + fired := toSet(e.Skills) + for _, want := range g.ExpectSkills { + if !fired[want] { + return fail(g, fmt.Sprintf("expected skill %q never fired", want)) + } + } + for _, bad := range g.RejectSkills { + if fired[bad] { + return fail(g, fmt.Sprintf("rejected skill %q fired", bad)) + } + } + return pass(g) +} + +func gradeBehavior(g GraderSpec, e *Evidence) GraderResult { + if g.MaxTools > 0 && e.Tools > g.MaxTools { + return fail(g, fmt.Sprintf("%d tools exceed max %d", e.Tools, g.MaxTools)) + } + if g.MaxTurns > 0 && e.Turns > g.MaxTurns { + return fail(g, fmt.Sprintf("%d turns exceed max %d", e.Turns, g.MaxTurns)) + } + if g.MaxDurationMs > 0 && e.DurationMs > g.MaxDurationMs { + return fail(g, fmt.Sprintf("%dms exceeds max %dms", e.DurationMs, g.MaxDurationMs)) + } + return pass(g) +} + +// --- small helpers --- + +func pass(g GraderSpec) GraderResult { return GraderResult{Type: g.Type, Name: g.Name, Pass: true} } +func fail(g GraderSpec, detail string) GraderResult { + return GraderResult{Type: g.Type, Name: g.Name, Pass: false, Detail: detail} +} + +func toSet(items []string) map[string]bool { + s := make(map[string]bool, len(items)) + for _, it := range items { + s[it] = true + } + return s +} + +// isSubsequence reports whether want appears in order (not necessarily +// contiguously) within have. An empty want trivially matches. +func isSubsequence(want, have []string) bool { + i := 0 + for _, h := range have { + if i < len(want) && h == want[i] { + i++ + } + } + return i == len(want) +} diff --git a/internal/eval/manifest.go b/internal/eval/manifest.go new file mode 100644 index 0000000..f7f3de4 --- /dev/null +++ b/internal/eval/manifest.go @@ -0,0 +1,157 @@ +// Package eval is qvr's deterministic, trace-based skill-evaluation substrate: +// it reads an evals.yaml manifest (a sibling of SKILL.md, qvr's own format — not +// part of the agentskills.io spec) and grades a skill's CAPTURED sessions +// against it. The grading is pure Go over the spans qvr already derived from +// real usage — no model calls, no agent execution, no sandbox replay. That keeps +// the gate inside qvr's "uv for skills" core; semantic LLM-judge grading and +// running a candidate skill against tasks are layered on top as skills, not +// built into this binary. +package eval + +import ( + "fmt" + "os" + "path/filepath" + + "gopkg.in/yaml.v3" +) + +// ManifestFile is the conventional eval manifest name, a sibling of SKILL.md. +const ManifestFile = "evals.yaml" + +// Manifest is a skill's eval definition: named suites of named cases, each a set +// of graders that must all pass for the case to pass. +type Manifest struct { + Version int `yaml:"version" json:"version"` + Suites []Suite `yaml:"suites" json:"suites"` +} + +// Suite is a named group of cases (e.g. "triage-correctness", "safety"). +type Suite struct { + Name string `yaml:"name" json:"name"` + Cases []Case `yaml:"cases" json:"cases"` +} + +// Case is one named expectation, satisfied only when every grader passes. +type Case struct { + Name string `yaml:"name" json:"name"` + Graders []GraderSpec `yaml:"graders" json:"graders"` +} + +// GraderSpec configures one grader. It is intentionally a flat struct: each +// grader type reads only the fields it cares about, which keeps YAML authoring +// simple and avoids polymorphic decoding. Unknown grader types fail loudly at +// run time rather than being silently skipped (a silently-skipped gate is worse +// than no gate). +type GraderSpec struct { + Type string `yaml:"type" json:"type"` + Name string `yaml:"name,omitempty" json:"name,omitempty"` + + // outcome grader: the session-level qvr.outcome must equal Expect. + Expect string `yaml:"expect,omitempty" json:"expect,omitempty"` + + // text grader: every Contains substring must appear and no Reject substring + // may appear in the field named by On (default: final_message). + On string `yaml:"on,omitempty" json:"on,omitempty"` + Contains []string `yaml:"contains,omitempty" json:"contains,omitempty"` + Reject []string `yaml:"reject,omitempty" json:"reject,omitempty"` + + // tool_sequence grader: Sequence must appear as an ordered subsequence of + // the session's tool calls. + Sequence []string `yaml:"sequence,omitempty" json:"sequence,omitempty"` + + // tool_constraint grader: every ExpectTools tool must have been used, no + // RejectTools tool may have been used, and (when >0) the call count must not + // exceed MaxTools. + ExpectTools []string `yaml:"expectTools,omitempty" json:"expect_tools,omitempty"` + RejectTools []string `yaml:"rejectTools,omitempty" json:"reject_tools,omitempty"` + + // skill_invocation grader: ExpectSkills must all have fired, RejectSkills + // none. + ExpectSkills []string `yaml:"expectSkills,omitempty" json:"expect_skills,omitempty"` + RejectSkills []string `yaml:"rejectSkills,omitempty" json:"reject_skills,omitempty"` + + // behavior grader: efficiency ceilings (0 = ignored). + MaxTools int `yaml:"maxTools,omitempty" json:"max_tools,omitempty"` + MaxTurns int `yaml:"maxTurns,omitempty" json:"max_turns,omitempty"` + MaxDurationMs int64 `yaml:"maxDurationMs,omitempty" json:"max_duration_ms,omitempty"` +} + +// Load reads and validates the evals.yaml in skillDir. Returns (nil, nil) when +// the skill has no manifest — an un-evaluated skill is not an error. +func Load(skillDir string) (*Manifest, error) { + path := filepath.Join(skillDir, ManifestFile) + data, err := os.ReadFile(path) + if err != nil { + if os.IsNotExist(err) { + return nil, nil + } + return nil, fmt.Errorf("read %s: %w", path, err) + } + return Parse(data) +} + +// Parse decodes and validates a manifest from raw YAML bytes. +func Parse(data []byte) (*Manifest, error) { + var m Manifest + if err := yaml.Unmarshal(data, &m); err != nil { + return nil, fmt.Errorf("parse evals.yaml: %w", err) + } + if err := m.validate(); err != nil { + return nil, err + } + return &m, nil +} + +// validate enforces the structural invariants: a supported version, named +// non-empty suites and cases, and only known grader types. +func (m *Manifest) validate() error { + if m.Version != 1 { + return fmt.Errorf("evals.yaml: unsupported version %d (want 1)", m.Version) + } + if len(m.Suites) == 0 { + return fmt.Errorf("evals.yaml: no suites defined") + } + for _, s := range m.Suites { + if s.Name == "" { + return fmt.Errorf("evals.yaml: a suite has no name") + } + if len(s.Cases) == 0 { + return fmt.Errorf("evals.yaml: suite %q has no cases", s.Name) + } + for _, c := range s.Cases { + if err := validateCase(s.Name, c); err != nil { + return err + } + } + } + return nil +} + +func validateCase(suite string, c Case) error { + if c.Name == "" { + return fmt.Errorf("evals.yaml: a case in suite %q has no name", suite) + } + if len(c.Graders) == 0 { + return fmt.Errorf("evals.yaml: case %q/%q has no graders", suite, c.Name) + } + for _, g := range c.Graders { + if _, ok := graders[g.Type]; !ok { + return fmt.Errorf("evals.yaml: case %q/%q uses unknown grader type %q", suite, c.Name, g.Type) + } + if err := validateGraderConfig(g); err != nil { + return fmt.Errorf("evals.yaml: case %q/%q %s grader %w", suite, c.Name, g.Type, err) + } + } + return nil +} + +// Suite returns the named suite, or nil when absent. +func (m *Manifest) Suite(name string) *Suite { + for i := range m.Suites { + if m.Suites[i].Name == name { + return &m.Suites[i] + } + } + return nil +} diff --git a/internal/eval/runner.go b/internal/eval/runner.go new file mode 100644 index 0000000..3403145 --- /dev/null +++ b/internal/eval/runner.go @@ -0,0 +1,107 @@ +package eval + +import ( + "context" + "fmt" + + "github.com/astra-sh/qvr/internal/ops/store" + "github.com/google/uuid" +) + +// RunInput parameterizes one eval run. SessionID pins which captured session to +// grade; when nil, the skill's most recent session is used (the "online eval" +// default — grade the latest real run). Suite "" grades every suite. +type RunInput struct { + SkillName string + SkillDir string + Suite string + SessionID *uuid.UUID +} + +// RunResult is the outcome of grading one session against the manifest. +type RunResult struct { + Skill string `json:"skill"` + SessionID string `json:"session_id"` + Pass bool `json:"pass"` + Passed int `json:"passed"` + Failed int `json:"failed"` + Suites []SuiteResult `json:"suites"` +} + +// Run loads the skill's evals.yaml, grades a captured session against the +// requested suite(s), and returns the result. It NEVER runs an agent or calls a +// model — it grades spans qvr already captured. Writing the result to the +// eval-run store (the lock × evidence join) is the caller's job. +func Run(ctx context.Context, s store.Store, in RunInput) (*RunResult, error) { + man, err := Load(in.SkillDir) + if err != nil { + return nil, err + } + if man == nil { + return nil, fmt.Errorf("skill %q has no %s in %s", in.SkillName, ManifestFile, in.SkillDir) + } + + suites, err := selectSuites(man, in.Suite) + if err != nil { + return nil, err + } + + sess, err := resolveSession(ctx, s, in) + if err != nil { + return nil, err + } + spans, err := s.QuerySpans(ctx, &store.SpanFilter{SessionID: &sess.SessionID}) + if err != nil { + return nil, fmt.Errorf("load spans for %s: %w", sess.SessionID, err) + } + ev := BuildEvidence(sess, spans) + ev.Skill = in.SkillName + + res := &RunResult{Skill: in.SkillName, SessionID: ev.SessionID, Pass: true} + for i := range suites { + sr := GradeSuite(&suites[i], ev) + res.Suites = append(res.Suites, sr) + res.Passed += sr.Passed + res.Failed += sr.Failed + if !sr.Pass { + res.Pass = false + } + } + return res, nil +} + +// selectSuites resolves the requested suite name to a slice — one named suite, +// or all of them when name is empty. +func selectSuites(man *Manifest, name string) ([]Suite, error) { + if name == "" { + return man.Suites, nil + } + su := man.Suite(name) + if su == nil { + return nil, fmt.Errorf("no suite %q in evals.yaml", name) + } + return []Suite{*su}, nil +} + +// resolveSession finds the session to grade: the pinned id, or the skill's most +// recent captured session. +func resolveSession(ctx context.Context, s store.Store, in RunInput) (*store.SessionMetaRow, error) { + if in.SessionID != nil { + m, err := s.GetSessionMeta(ctx, *in.SessionID) + if err != nil { + return nil, fmt.Errorf("look up session %s: %w", in.SessionID, err) + } + if m == nil { + return nil, fmt.Errorf("no captured session %s", in.SessionID) + } + return m, nil + } + rows, err := s.ListSessionMeta(ctx, &store.SessionMetaFilter{Skill: in.SkillName, Limit: 1}) + if err != nil { + return nil, fmt.Errorf("find latest session for %q: %w", in.SkillName, err) + } + if len(rows) == 0 { + return nil, fmt.Errorf("no captured sessions for skill %q to evaluate — run it, then `qvr audit discover`", in.SkillName) + } + return rows[0], nil +} diff --git a/internal/ops/derive/claude.go b/internal/ops/derive/claude.go index 70e14f6..e7cfe81 100644 --- a/internal/ops/derive/claude.go +++ b/internal/ops/derive/claude.go @@ -305,10 +305,12 @@ func (t *turn) applyToolResult(b claudeBlock, ts int64) { return } sp := &t.tools[idx] - sp.Attributes["gen_ai.tool.call.result"] = decodeToolResultText(b.Content) + result := decodeToolResultText(b.Content) + sp.Attributes["gen_ai.tool.call.result"] = result if b.IsError { sp.Attributes["error.type"] = "tool_failure" } + sp.Attributes[OutcomeKey] = classifyOutcome(result, b.IsError) if ts > sp.StartMs { sp.EndMs = ts } diff --git a/internal/ops/derive/derive.go b/internal/ops/derive/derive.go index cb512b6..6d46748 100644 --- a/internal/ops/derive/derive.go +++ b/internal/ops/derive/derive.go @@ -42,7 +42,14 @@ import ( // resultDisplay (string OR object) no longer drops the whole item. Dispatch // is also mixed-agent safe: the majority transcript agent owns the session, // so a stale external writer's foreign rows can't blank it. -const Version = 7 +// v8: outcome signal — TOOL/SKILL spans carry qvr.outcome +// (success/failure/blocked; absent = unknown), derived from the result's error +// flag plus a denial/interrupt marker for blocked (hook-deny lives in hook +// payloads the deriver never sees, so transcript-silent denial is failure). +// SessionMeta carries a worst-of-spans rollup (Outcome), and the OTLP span +// status code now reflects it (ERROR for failure/blocked) instead of a +// hardcoded OK. Rederive backfills outcome onto historical sessions. +const Version = 8 // KindSkill is the Quiver span category for a skill load/invocation within a // turn. It exists so the trace makes skill usage a first-class, queryable stage @@ -79,6 +86,10 @@ type SessionMeta struct { // only at session level (hermes, copilot), and a deriver-set value wins. TokensIn *int64 TokensOut *int64 + // Outcome is the session-level rollup of its TOOL/SKILL spans' qvr.outcome: + // the worst seen (failure > blocked > success), or "" when no span carried an + // outcome (rendered unknown, never a fabricated success). + Outcome string } // Derivation is one session's full derived projection: the unified meta plus @@ -211,6 +222,7 @@ func finalizeMeta(d *Derivation, rows []*ops.RawTrace) { } fillMetaFromSpans(m, d.Spans) fillMetaTokens(m, d.Spans) + fillMetaOutcome(m, d.Spans) // Formats without in-file timestamps derive zero time bounds; fall back to // the capture times so the session still buckets into the right day. if m.StartedMs == 0 && len(rows) > 0 { @@ -288,6 +300,36 @@ func fillMetaTokens(m *SessionMeta, spans []Span) { } } +// outcomeRank orders outcomes by severity for the session rollup: a worse +// outcome on any span dominates. Unknown/absent ranks below an explicit +// success, so a session with one graded success reads success, not unknown. +var outcomeRank = map[string]int{ + OutcomeSuccess: 1, + OutcomeBlocked: 2, + OutcomeFailure: 3, +} + +// fillMetaOutcome rolls the TOOL/SKILL spans' qvr.outcome into a single +// session-level verdict: the worst outcome any span carried. Spans without an +// outcome (LLM turns, calls whose result was never seen) don't vote; a session +// where none voted stays "" (unknown), never a fabricated success. +func fillMetaOutcome(m *SessionMeta, spans []Span) { + best, bestRank := "", 0 + for _, sp := range spans { + if sp.Kind != KindTool && sp.Kind != KindSkill { + continue + } + v, ok := sp.Attributes[OutcomeKey].(string) + if !ok { + continue + } + if r := outcomeRank[v]; r > bestRank { + best, bestRank = v, r + } + } + m.Outcome = best +} + // addAttrTokens folds one span's token attribute value into a running // nullable sum; an absent attribute leaves the sum untouched (and nil). func addAttrTokens(sum **int64, v any) { diff --git a/internal/ops/derive/outcome_test.go b/internal/ops/derive/outcome_test.go new file mode 100644 index 0000000..4001f2f --- /dev/null +++ b/internal/ops/derive/outcome_test.go @@ -0,0 +1,156 @@ +package derive_test + +import ( + "testing" + + "github.com/astra-sh/qvr/internal/ops" + "github.com/astra-sh/qvr/internal/ops/derive" + "github.com/google/uuid" +) + +// Outcome contract (deriver v8): +// - TOOL/SKILL spans carry qvr.outcome ONLY once their result arrives — +// success when the result is not an error, failure when it is, and blocked +// when the error text is a user denial/interrupt (a governance signal). +// - A call whose result is never seen carries NO outcome (absence = unknown); +// we never fabricate a success. +// - SessionMeta.Outcome is the worst-of-spans rollup. +// - The OTLP span status code reflects the outcome (ERROR for failure/blocked) +// instead of the old hardcoded OK. + +// claudeToolRun builds a minimal claude session: a prompt, a Bash tool_use, and +// its tool_result with the given text + error flag. +func claudeToolRun(sid uuid.UUID, result string, isError bool) []*ops.RawTrace { + errField := "" + if isError { + errField = `,"is_error":true` + } + return []*ops.RawTrace{ + row(sid, 0, `{"type":"user","timestamp":"2026-06-12T00:00:00.000Z","message":{"role":"user","content":"run it"}}`), + row(sid, 1, `{"type":"assistant","timestamp":"2026-06-12T00:00:01.000Z","message":{"role":"assistant","model":"claude-opus-4-8","content":[{"type":"tool_use","id":"tu1","name":"Bash","input":{"command":"ls"}}]}}`), + row(sid, 2, `{"type":"user","timestamp":"2026-06-12T00:00:02.000Z","message":{"role":"user","content":[{"type":"tool_result","tool_use_id":"tu1","content":`+jsonQuote(result)+errField+`}]}}`), + } +} + +// jsonQuote returns a JSON string literal for s. +func jsonQuote(s string) string { + out := []byte{'"'} + for _, r := range s { + switch r { + case '"', '\\': + out = append(out, '\\', byte(r)) + case '\n': + out = append(out, '\\', 'n') + default: + out = append(out, []byte(string(r))...) + } + } + return string(append(out, '"')) +} + +func TestOutcome_Success(t *testing.T) { + sid := uuid.MustParse("550e8400-e29b-41d4-a716-44665544aa01") + d, err := derive.DeriveSession(claudeToolRun(sid, "done", false)) + if err != nil { + t.Fatalf("derive: %v", err) + } + _, tool, _ := splitSpanKinds(d.Spans) + wantAttrs(t, tool, map[string]any{derive.OutcomeKey: derive.OutcomeSuccess}) + if d.Meta.Outcome != derive.OutcomeSuccess { + t.Errorf("meta outcome = %q, want %q", d.Meta.Outcome, derive.OutcomeSuccess) + } +} + +func TestOutcome_Failure(t *testing.T) { + sid := uuid.MustParse("550e8400-e29b-41d4-a716-44665544aa02") + d, err := derive.DeriveSession(claudeToolRun(sid, "command not found", true)) + if err != nil { + t.Fatalf("derive: %v", err) + } + _, tool, _ := splitSpanKinds(d.Spans) + wantAttrs(t, tool, map[string]any{derive.OutcomeKey: derive.OutcomeFailure}) + if d.Meta.Outcome != derive.OutcomeFailure { + t.Errorf("meta outcome = %q, want %q", d.Meta.Outcome, derive.OutcomeFailure) + } + // The OTLP status code must now be ERROR(2), not the old hardcoded OK(1). + if code := otlpStatusForSpan(derive.ToOTLP(d.Spans), tool.SpanID); code != 2 { + t.Errorf("OTLP status code = %v, want 2 (ERROR) for a failed tool call", code) + } +} + +func TestOutcome_Blocked(t *testing.T) { + sid := uuid.MustParse("550e8400-e29b-41d4-a716-44665544aa03") + // A user-rejected tool call: is_error with the harness denial text. + d, err := derive.DeriveSession(claudeToolRun(sid, + "The user doesn't want to proceed with this tool use. The tool use was rejected.", true)) + if err != nil { + t.Fatalf("derive: %v", err) + } + _, tool, _ := splitSpanKinds(d.Spans) + wantAttrs(t, tool, map[string]any{derive.OutcomeKey: derive.OutcomeBlocked}) + if d.Meta.Outcome != derive.OutcomeBlocked { + t.Errorf("meta outcome = %q, want %q", d.Meta.Outcome, derive.OutcomeBlocked) + } +} + +// TestOutcome_PendingIsUnknown pins the absence invariant: a tool call whose +// result never arrives carries no outcome, and the session rolls up to "". +func TestOutcome_PendingIsUnknown(t *testing.T) { + sid := uuid.MustParse("550e8400-e29b-41d4-a716-44665544aa04") + rows := []*ops.RawTrace{ + row(sid, 0, `{"type":"user","timestamp":"2026-06-12T00:00:00.000Z","message":{"role":"user","content":"run it"}}`), + row(sid, 1, `{"type":"assistant","timestamp":"2026-06-12T00:00:01.000Z","message":{"role":"assistant","model":"claude-opus-4-8","content":[{"type":"tool_use","id":"tu1","name":"Bash","input":{"command":"ls"}}]}}`), + } + d, err := derive.DeriveSession(rows) + if err != nil { + t.Fatalf("derive: %v", err) + } + _, tool, _ := splitSpanKinds(d.Spans) + if v, ok := tool.Attributes[derive.OutcomeKey]; ok { + t.Errorf("qvr.outcome present (%v) on a call with no result — fabricated outcome", v) + } + if d.Meta.Outcome != "" { + t.Errorf("meta outcome = %q, want \"\" (unknown) when no span carried an outcome", d.Meta.Outcome) + } +} + +// TestOutcome_SessionWorstOf pins the rollup: one success and one failure in the +// same session roll up to failure (the worst). +func TestOutcome_SessionWorstOf(t *testing.T) { + sid := uuid.MustParse("550e8400-e29b-41d4-a716-44665544aa05") + rows := []*ops.RawTrace{ + row(sid, 0, `{"type":"user","timestamp":"2026-06-12T00:00:00.000Z","message":{"role":"user","content":"run both"}}`), + row(sid, 1, `{"type":"assistant","timestamp":"2026-06-12T00:00:01.000Z","message":{"role":"assistant","model":"claude-opus-4-8","content":[{"type":"tool_use","id":"tu1","name":"Bash","input":{"command":"ok"}},{"type":"tool_use","id":"tu2","name":"Bash","input":{"command":"bad"}}]}}`), + row(sid, 2, `{"type":"user","timestamp":"2026-06-12T00:00:02.000Z","message":{"role":"user","content":[{"type":"tool_result","tool_use_id":"tu1","content":"fine"}]}}`), + row(sid, 3, `{"type":"user","timestamp":"2026-06-12T00:00:03.000Z","message":{"role":"user","content":[{"type":"tool_result","tool_use_id":"tu2","content":"boom","is_error":true}]}}`), + } + d, err := derive.DeriveSession(rows) + if err != nil { + t.Fatalf("derive: %v", err) + } + if d.Meta.Outcome != derive.OutcomeFailure { + t.Errorf("meta outcome = %q, want %q (worst-of success+failure)", d.Meta.Outcome, derive.OutcomeFailure) + } +} + +// otlpStatusForSpan digs the status.code out of a ToOTLP envelope for one span, +// or -1 when not found. +func otlpStatusForSpan(env map[string]any, spanID string) int { + rs, _ := env["resourceSpans"].([]map[string]any) + for _, r := range rs { + ss, _ := r["scopeSpans"].([]map[string]any) + for _, sc := range ss { + spans, _ := sc["spans"].([]map[string]any) + for _, sp := range spans { + if sp["spanId"] == spanID { + if st, ok := sp["status"].(map[string]any); ok { + if c, ok := st["code"].(int); ok { + return c + } + } + } + } + } + } + return -1 +} diff --git a/internal/ops/derive/shared.go b/internal/ops/derive/shared.go index acd0664..4bbd922 100644 --- a/internal/ops/derive/shared.go +++ b/internal/ops/derive/shared.go @@ -234,6 +234,7 @@ func applyResultTo(sp *Span, result string, ts int64, isError bool) { if isError { sp.Attributes["error.type"] = "tool_failure" } + sp.Attributes[OutcomeKey] = classifyOutcome(result, isError) if ts > sp.StartMs { sp.EndMs = ts } diff --git a/internal/ops/derive/span.go b/internal/ops/derive/span.go index 2aa4e35..5724c41 100644 --- a/internal/ops/derive/span.go +++ b/internal/ops/derive/span.go @@ -17,6 +17,7 @@ package derive import ( "encoding/json" "strconv" + "strings" ) // Default resource/scope identifiers stamped on emitted spans. Vendor-neutral. @@ -35,6 +36,62 @@ const ( KindChain = "CHAIN" ) +// OutcomeKey is the Quiver per-span result attribute on TOOL/SKILL spans: did +// the call succeed, fail, or get blocked? It is derived purely from the +// transcript — a tool result's error flag and, for a denied/interrupted call, +// its text — so it stays a regenerable projection. Its ABSENCE means unknown +// (the call never reported a result); we never fabricate a success. The session +// read model rolls these up into SessionMeta.Outcome (worst-of-spans). +const OutcomeKey = "qvr.outcome" + +const ( + OutcomeSuccess = "success" + OutcomeFailure = "failure" + OutcomeBlocked = "blocked" + OutcomeUnknown = "unknown" +) + +// classifyOutcome maps a tool result to an outcome. A non-error result is a +// success; an error the transcript marks as a user denial or interrupt is +// blocked (a governance signal, not a tool defect); any other error is a +// failure. +func classifyOutcome(result string, isError bool) string { + if !isError { + return OutcomeSuccess + } + if isBlockedResult(result) { + return OutcomeBlocked + } + return OutcomeFailure +} + +// blockedResultMarkers are substrings a harness writes into a tool result when +// the user REJECTED or INTERRUPTED the call rather than the tool failing on its +// own (observed in real Claude Code stores). Matched case-insensitively. This +// is the only "blocked" source available to a pure deriver: hook-deny decisions +// live in hook payloads, which the deriver never sees (capture filters to +// transcript rows), so denial that a transcript doesn't echo derives to failure. +var blockedResultMarkers = []string{ + "the user doesn't want to proceed", + "tool use was rejected", + "user doesn't want to take this action", + "request interrupted by user", + "[request interrupted", +} + +func isBlockedResult(result string) bool { + if result == "" { + return false + } + low := strings.ToLower(result) + for _, m := range blockedResultMarkers { + if strings.Contains(low, m) { + return true + } + } + return false +} + // Span is one derived span. Times are epoch milliseconds. Attributes use OTel // GenAI semantic-convention keys (e.g. "gen_ai.operation.name", // "gen_ai.request.model", "gen_ai.usage.input_tokens") plus the Quiver @@ -116,7 +173,7 @@ func (s Span) otlpSpan() map[string]any { "startTimeUnixNano": msToNano(s.StartMs), "endTimeUnixNano": msToNano(end), "attributes": attrs, - "status": map[string]any{"code": 1}, + "status": map[string]any{"code": otlpStatusCode(s.Attributes)}, } if s.ParentSpanID != "" { obj["parentSpanId"] = s.ParentSpanID @@ -124,6 +181,22 @@ func (s Span) otlpSpan() map[string]any { return obj } +// otlpStatusCode maps a span's outcome to an OTLP status code: a failed or +// blocked call is ERROR(2), an explicit success is OK(1), and anything without +// an outcome (LLM turns, calls whose result was never seen) is UNSET(0), the +// OTel default. Before this, every span emitted OK(1) unconditionally — a +// failed tool call looked successful to any OTLP backend. +func otlpStatusCode(attrs map[string]any) int { + switch attrs[OutcomeKey] { + case OutcomeFailure, OutcomeBlocked: + return 2 + case OutcomeSuccess: + return 1 + default: + return 0 + } +} + // ToOTLP renders a batch of spans as a single OTLP resourceSpans payload. For // an empty batch it returns a well-formed envelope with an empty resourceSpans // array — never a nil/`null` body — so a consumer always receives valid OTLP diff --git a/internal/ops/rawtrace/capture.go b/internal/ops/rawtrace/capture.go index 92da4aa..8e790d7 100644 --- a/internal/ops/rawtrace/capture.go +++ b/internal/ops/rawtrace/capture.go @@ -134,6 +134,7 @@ func persistDerivation(ctx context.Context, s Store, sessionID uuid.UUID) (int, Skills: d.Meta.Skills, TokensIn: d.Meta.TokensIn, TokensOut: d.Meta.TokensOut, + Outcome: d.Meta.Outcome, DeriverVersion: derive.Version, } return len(out), hasSkill, s.ReplaceSessionDerivation(ctx, meta, out) diff --git a/internal/ops/store/annotations.go b/internal/ops/store/annotations.go new file mode 100644 index 0000000..c3db406 --- /dev/null +++ b/internal/ops/store/annotations.go @@ -0,0 +1,118 @@ +package store + +import ( + "context" + "database/sql" + "fmt" + "strings" + "time" + + "github.com/google/uuid" +) + +// AnnotationRow is one human verdict on a captured session — the outer-loop +// feedback signal. It is durable: derivation (`qvr audit rederive`) never +// rewrites it, unlike the spans and session_meta projections. A whole-session +// verdict leaves Skill empty; a per-skill verdict scopes it to one skill. +type AnnotationRow struct { + SessionID uuid.UUID `json:"session_id"` + Skill string `json:"skill,omitempty"` + Outcome string `json:"outcome"` // the reviewer's verdict (e.g. good, bad, blocked) + Note string `json:"note,omitempty"` + Author string `json:"author,omitempty"` + CreatedAt time.Time `json:"created_at"` +} + +// AnnotationFilter scopes ListAnnotations. Nil/zero fields are ignored; the +// result is newest-first by created_at. +type AnnotationFilter struct { + SessionID *uuid.UUID + Skill string + Since *time.Time +} + +// PutAnnotation appends one human verdict. Annotations are append-only — a +// session can accrue several over time — so this never replaces a prior row. +func (s *sqliteStore) PutAnnotation(ctx context.Context, a *AnnotationRow) error { + if a == nil { + return fmt.Errorf("store: nil annotation") + } + if a.Outcome == "" { + return fmt.Errorf("store: annotation outcome is required") + } + if a.CreatedAt.IsZero() { + a.CreatedAt = time.Now().UTC() + } + _, err := s.db.ExecContext(ctx, + `INSERT INTO annotations(session_id, skill, outcome, note, author, created_at) + VALUES (?,?,?,?,?,?)`, + a.SessionID.String(), nullableString(a.Skill), a.Outcome, + nullableString(a.Note), nullableString(a.Author), a.CreatedAt.UTC()) + if err != nil { + return fmt.Errorf("store: put annotation: %w", err) + } + return nil +} + +// ListAnnotations returns the verdicts matching the filter, newest-first. +func (s *sqliteStore) ListAnnotations(ctx context.Context, f *AnnotationFilter) ([]*AnnotationRow, error) { + var where []string + var args []any + if f != nil { + if f.SessionID != nil { + where = append(where, "session_id = ?") + args = append(args, f.SessionID.String()) + } + if f.Skill != "" { + where = append(where, "skill = ?") + args = append(args, f.Skill) + } + if f.Since != nil { + where = append(where, "created_at >= ?") + args = append(args, f.Since.UTC()) + } + } + q := `SELECT session_id, skill, outcome, note, author, created_at FROM annotations` + if len(where) > 0 { + q += " WHERE " + strings.Join(where, " AND ") + } + q += " ORDER BY created_at DESC" + + rows, err := s.db.QueryContext(ctx, q, args...) + if err != nil { + return nil, fmt.Errorf("store: list annotations: %w", err) + } + defer rows.Close() + + var out []*AnnotationRow + for rows.Next() { + a, err := scanAnnotation(rows) + if err != nil { + return nil, err + } + out = append(out, a) + } + return out, rows.Err() +} + +func scanAnnotation(row interface{ Scan(...any) error }) (*AnnotationRow, error) { + var ( + a AnnotationRow + sid string + skill, note, author sql.NullString + createdAt time.Time + ) + if err := row.Scan(&sid, &skill, &a.Outcome, ¬e, &author, &createdAt); err != nil { + return nil, fmt.Errorf("store: scan annotation: %w", err) + } + id, err := uuid.Parse(sid) + if err != nil { + return nil, fmt.Errorf("store: bad annotation session_id %q: %w", sid, err) + } + a.SessionID = id + a.Skill = skill.String + a.Note = note.String + a.Author = author.String + a.CreatedAt = createdAt.UTC() + return &a, nil +} diff --git a/internal/ops/store/annotations_test.go b/internal/ops/store/annotations_test.go new file mode 100644 index 0000000..7cfae24 --- /dev/null +++ b/internal/ops/store/annotations_test.go @@ -0,0 +1,104 @@ +package store + +import ( + "context" + "testing" + + "github.com/google/uuid" +) + +// TestAnnotations_RoundTrip pins the write/read path: a verdict reads back with +// its fields, and filters scope by session and skill. +func TestAnnotations_RoundTrip(t *testing.T) { + st := openTestStore(t) + ctx := context.Background() + sid := uuid.New() + + if err := st.PutAnnotation(ctx, &AnnotationRow{ + SessionID: sid, Skill: "triage-issue", Outcome: "bad", + Note: "ambiguous: needs a setting", Author: "rakshith", + }); err != nil { + t.Fatalf("put annotation: %v", err) + } + + got, err := st.ListAnnotations(ctx, &AnnotationFilter{SessionID: &sid}) + if err != nil { + t.Fatalf("list: %v", err) + } + if len(got) != 1 { + t.Fatalf("want 1 annotation, got %d", len(got)) + } + a := got[0] + if a.Skill != "triage-issue" || a.Outcome != "bad" || + a.Note != "ambiguous: needs a setting" || a.Author != "rakshith" { + t.Errorf("annotation did not round-trip: %+v", a) + } + if a.CreatedAt.IsZero() { + t.Error("created_at was not stamped") + } + + // Skill filter matches the per-skill verdict. + bySkill, _ := st.ListAnnotations(ctx, &AnnotationFilter{Skill: "triage-issue"}) + if len(bySkill) != 1 { + t.Errorf("skill filter: want 1, got %d", len(bySkill)) + } + none, _ := st.ListAnnotations(ctx, &AnnotationFilter{Skill: "other"}) + if len(none) != 0 { + t.Errorf("skill filter (other): want 0, got %d", len(none)) + } +} + +// TestAnnotations_RequiresOutcome pins the validation guard. +func TestAnnotations_RequiresOutcome(t *testing.T) { + st := openTestStore(t) + if err := st.PutAnnotation(context.Background(), &AnnotationRow{SessionID: uuid.New()}); err == nil { + t.Error("expected an error for a verdict with no outcome") + } +} + +// TestAnnotations_SurviveRederive proves the whole point of a separate table: +// re-deriving a session (which replaces spans + session_meta) does NOT touch +// the human verdict. +func TestAnnotations_SurviveRederive(t *testing.T) { + st := openTestStore(t) + ctx := context.Background() + sid := uuid.New() + + if err := st.ReplaceSessionDerivation(ctx, metaFor(sid, "claude-code", 1000, "triage-issue"), nil); err != nil { + t.Fatalf("derive: %v", err) + } + if err := st.PutAnnotation(ctx, &AnnotationRow{SessionID: sid, Skill: "triage-issue", Outcome: "bad"}); err != nil { + t.Fatalf("annotate: %v", err) + } + // Re-derive (the rederive path uses this same write). + if err := st.ReplaceSessionDerivation(ctx, metaFor(sid, "claude-code", 1000, "triage-issue"), nil); err != nil { + t.Fatalf("re-derive: %v", err) + } + + got, _ := st.ListAnnotations(ctx, &AnnotationFilter{SessionID: &sid}) + if len(got) != 1 { + t.Fatalf("annotation did not survive rederive: got %d", len(got)) + } +} + +// TestAnnotations_ClearedOnDeleteSession proves a hard session purge removes +// its verdicts (nothing left to annotate). +func TestAnnotations_ClearedOnDeleteSession(t *testing.T) { + st := openTestStore(t) + ctx := context.Background() + sid := uuid.New() + + if err := st.ReplaceSessionDerivation(ctx, metaFor(sid, "codex", 1000, "tdd"), nil); err != nil { + t.Fatalf("derive: %v", err) + } + if err := st.PutAnnotation(ctx, &AnnotationRow{SessionID: sid, Outcome: "good"}); err != nil { + t.Fatalf("annotate: %v", err) + } + if _, err := st.DeleteSession(ctx, sid); err != nil { + t.Fatalf("delete: %v", err) + } + got, _ := st.ListAnnotations(ctx, &AnnotationFilter{SessionID: &sid}) + if len(got) != 0 { + t.Errorf("annotations survived DeleteSession: %d", len(got)) + } +} diff --git a/internal/ops/store/eval_runs.go b/internal/ops/store/eval_runs.go new file mode 100644 index 0000000..26da6de --- /dev/null +++ b/internal/ops/store/eval_runs.go @@ -0,0 +1,177 @@ +package store + +import ( + "context" + "database/sql" + "fmt" + "strings" + "time" +) + +// EvalRunRow is one recorded eval run, keyed by {skill_name, skill_commit} so it +// joins to spans/lineage by the exact locked commit that ran. Cases carry the +// per-case verdicts; they are written with the run and loaded on demand. +type EvalRunRow struct { + ID int64 `json:"id"` + SkillName string `json:"skill_name"` + SkillCommit string `json:"skill_commit"` + Suite string `json:"suite"` + SessionID string `json:"session_id,omitempty"` + StartedAt time.Time `json:"started_at"` + Passed int `json:"passed"` + Failed int `json:"failed"` + Pass bool `json:"pass"` + Cases []EvalCaseRow `json:"cases,omitempty"` +} + +// EvalCaseRow is one case's verdict within a run. +type EvalCaseRow struct { + Suite string `json:"suite"` + Case string `json:"case"` + Pass bool `json:"pass"` + Detail string `json:"detail,omitempty"` +} + +// EvalRunFilter scopes ListEvalRuns. Nil/zero fields are ignored; results are +// newest-first by started_at. +type EvalRunFilter struct { + SkillName string + SkillCommit string + Since *time.Time + Limit int +} + +// PutEvalRun inserts a run and its case rows in one tx, returning the new run id. +func (s *sqliteStore) PutEvalRun(ctx context.Context, r *EvalRunRow) (int64, error) { + if r == nil { + return 0, fmt.Errorf("store: nil eval run") + } + if r.StartedAt.IsZero() { + r.StartedAt = time.Now().UTC() + } + tx, err := s.db.BeginTx(ctx, nil) + if err != nil { + return 0, fmt.Errorf("store: eval run tx: %w", err) + } + defer func() { _ = tx.Rollback() }() + + res, err := tx.ExecContext(ctx, + `INSERT INTO eval_runs(skill_name, skill_commit, suite, session_id, started_at, passed, failed, pass) + VALUES (?,?,?,?,?,?,?,?)`, + r.SkillName, r.SkillCommit, r.Suite, nullableString(r.SessionID), + r.StartedAt.UTC(), r.Passed, r.Failed, boolToInt(r.Pass)) + if err != nil { + return 0, fmt.Errorf("store: insert eval run: %w", err) + } + id, _ := res.LastInsertId() + for _, c := range r.Cases { + if _, err := tx.ExecContext(ctx, + `INSERT INTO eval_case_results(eval_run_id, suite, case_name, pass, detail) + VALUES (?,?,?,?,?)`, + id, c.Suite, c.Case, boolToInt(c.Pass), nullableString(c.Detail)); err != nil { + return 0, fmt.Errorf("store: insert eval case: %w", err) + } + } + if err := tx.Commit(); err != nil { + return 0, fmt.Errorf("store: eval run commit: %w", err) + } + r.ID = id + return id, nil +} + +// ListEvalRuns returns runs matching the filter, newest-first, each with its +// case rows loaded. +func (s *sqliteStore) ListEvalRuns(ctx context.Context, f *EvalRunFilter) ([]*EvalRunRow, error) { + var where []string + var args []any + if f != nil { + if f.SkillName != "" { + where = append(where, "skill_name = ?") + args = append(args, f.SkillName) + } + if f.SkillCommit != "" { + where = append(where, "skill_commit = ?") + args = append(args, f.SkillCommit) + } + if f.Since != nil { + where = append(where, "started_at >= ?") + args = append(args, f.Since.UTC()) + } + } + q := `SELECT id, skill_name, skill_commit, suite, session_id, started_at, passed, failed, pass FROM eval_runs` + if len(where) > 0 { + q += " WHERE " + strings.Join(where, " AND ") + } + q += " ORDER BY started_at DESC" + if f != nil && f.Limit > 0 { + q += " LIMIT ?" + args = append(args, f.Limit) + } + + rows, err := s.db.QueryContext(ctx, q, args...) + if err != nil { + return nil, fmt.Errorf("store: list eval runs: %w", err) + } + defer rows.Close() + + var out []*EvalRunRow + for rows.Next() { + r, err := scanEvalRun(rows) + if err != nil { + return nil, err + } + out = append(out, r) + } + if err := rows.Err(); err != nil { + return nil, err + } + for _, r := range out { + cases, err := s.evalCases(ctx, r.ID) + if err != nil { + return nil, err + } + r.Cases = cases + } + return out, nil +} + +func (s *sqliteStore) evalCases(ctx context.Context, runID int64) ([]EvalCaseRow, error) { + rows, err := s.db.QueryContext(ctx, + `SELECT suite, case_name, pass, detail FROM eval_case_results WHERE eval_run_id = ?`, runID) + if err != nil { + return nil, fmt.Errorf("store: list eval cases: %w", err) + } + defer rows.Close() + var out []EvalCaseRow + for rows.Next() { + var ( + c EvalCaseRow + p int + detail sql.NullString + ) + if err := rows.Scan(&c.Suite, &c.Case, &p, &detail); err != nil { + return nil, fmt.Errorf("store: scan eval case: %w", err) + } + c.Pass = p != 0 + c.Detail = detail.String + out = append(out, c) + } + return out, rows.Err() +} + +func scanEvalRun(row interface{ Scan(...any) error }) (*EvalRunRow, error) { + var ( + r EvalRunRow + sessionID sql.NullString + passInt int + startedAt time.Time + ) + if err := row.Scan(&r.ID, &r.SkillName, &r.SkillCommit, &r.Suite, &sessionID, + &startedAt, &r.Passed, &r.Failed, &passInt); err != nil { + return nil, fmt.Errorf("store: scan eval run: %w", err) + } + r.SessionID = sessionID.String + r.StartedAt = startedAt.UTC() + r.Pass = passInt != 0 + return &r, nil +} diff --git a/internal/ops/store/eval_runs_test.go b/internal/ops/store/eval_runs_test.go new file mode 100644 index 0000000..75c63a0 --- /dev/null +++ b/internal/ops/store/eval_runs_test.go @@ -0,0 +1,66 @@ +package store + +import ( + "context" + "testing" +) + +// TestEvalRuns_RoundTrip pins the write/read path: a run and its case rows land +// atomically, read back newest-first, and filter by skill commit. +func TestEvalRuns_RoundTrip(t *testing.T) { + st := openTestStore(t) + ctx := context.Background() + + // Two runs of the same skill at different commits: the older fails, the + // newer (post-fix) passes — the article's fail→pass. + if _, err := st.PutEvalRun(ctx, &EvalRunRow{ + SkillName: "triage-issue", SkillCommit: "aaaaaaa", Suite: "triage-correctness", + Passed: 0, Failed: 1, Pass: false, + Cases: []EvalCaseRow{{Suite: "triage-correctness", Case: "needs-info", Pass: false, Detail: "text: missing needs-info"}}, + }); err != nil { + t.Fatalf("put failing run: %v", err) + } + id2, err := st.PutEvalRun(ctx, &EvalRunRow{ + SkillName: "triage-issue", SkillCommit: "bbbbbbb", Suite: "triage-correctness", + Passed: 1, Failed: 0, Pass: true, + Cases: []EvalCaseRow{{Suite: "triage-correctness", Case: "needs-info", Pass: true}}, + }) + if err != nil { + t.Fatalf("put passing run: %v", err) + } + if id2 == 0 { + t.Fatal("expected a non-zero run id") + } + + all, err := st.ListEvalRuns(ctx, &EvalRunFilter{SkillName: "triage-issue"}) + if err != nil { + t.Fatalf("list: %v", err) + } + if len(all) != 2 { + t.Fatalf("want 2 runs, got %d", len(all)) + } + // Newest-first: the passing run leads. + if !all[0].Pass || all[0].SkillCommit != "bbbbbbb" { + t.Errorf("newest run wrong: %+v", all[0]) + } + if len(all[0].Cases) != 1 || !all[0].Cases[0].Pass { + t.Errorf("case rows did not round-trip: %+v", all[0].Cases) + } + if len(all[1].Cases) != 1 || all[1].Cases[0].Detail != "text: missing needs-info" { + t.Errorf("failing case detail did not round-trip: %+v", all[1].Cases) + } + + // Commit filter isolates the passing run. + byCommit, _ := st.ListEvalRuns(ctx, &EvalRunFilter{SkillName: "triage-issue", SkillCommit: "bbbbbbb"}) + if len(byCommit) != 1 || !byCommit[0].Pass { + t.Errorf("commit filter: want 1 passing, got %+v", byCommit) + } +} + +// TestEvalRuns_RequiresRow pins the nil guard. +func TestEvalRuns_RequiresRow(t *testing.T) { + st := openTestStore(t) + if _, err := st.PutEvalRun(context.Background(), nil); err == nil { + t.Error("expected an error for a nil eval run") + } +} diff --git a/internal/ops/store/migrations/0007_session_meta_outcome.sql b/internal/ops/store/migrations/0007_session_meta_outcome.sql new file mode 100644 index 0000000..8c9062c --- /dev/null +++ b/internal/ops/store/migrations/0007_session_meta_outcome.sql @@ -0,0 +1,6 @@ +-- 0007: nullable session-level outcome on the unified session read model. +-- The worst-of-spans rollup of each session's TOOL/SKILL qvr.outcome +-- attribute (success/failure/blocked), derived at derive time. NULL means no +-- span carried an outcome — consumers render "unknown", never a fabricated +-- success. Regenerated by `qvr audit rederive` after the v8 deriver bump. +ALTER TABLE session_meta ADD COLUMN outcome TEXT; diff --git a/internal/ops/store/migrations/0008_annotations.sql b/internal/ops/store/migrations/0008_annotations.sql new file mode 100644 index 0000000..e35f90b --- /dev/null +++ b/internal/ops/store/migrations/0008_annotations.sql @@ -0,0 +1,19 @@ +-- 0008: human annotations on captured sessions — the outer-loop feedback +-- signal (a reviewer's verdict on what a skill actually did, with a reason). +-- Unlike spans and session_meta, an annotation is NOT a derived projection: +-- `qvr audit rederive` regenerates those from raw, which would erase a human +-- edit written there. The verdict therefore lives in its own table that +-- derivation never rewrites. A NULL skill is a whole-session annotation; a set +-- skill scopes the verdict to one skill within the session. Append-only: a +-- session can accrue several verdicts over time (the latest by created_at wins +-- for consumers that want one). +CREATE TABLE IF NOT EXISTS annotations ( + session_id TEXT NOT NULL, + skill TEXT, + outcome TEXT NOT NULL, + note TEXT, + author TEXT, + created_at DATETIME NOT NULL +); +CREATE INDEX IF NOT EXISTS idx_annotations_session ON annotations(session_id); +CREATE INDEX IF NOT EXISTS idx_annotations_skill ON annotations(skill); diff --git a/internal/ops/store/migrations/0009_eval_runs.sql b/internal/ops/store/migrations/0009_eval_runs.sql new file mode 100644 index 0000000..340ea8c --- /dev/null +++ b/internal/ops/store/migrations/0009_eval_runs.sql @@ -0,0 +1,27 @@ +-- 0009: eval results, keyed by {skill_name, skill_commit} — the lock × evidence +-- join that is qvr's moat. An eval run grades a captured session against the +-- skill's evals.yaml; the verdict is pinned to the exact locked commit that +-- ran, so lineage can show "at commit abc the safety suite failed; at def it +-- passed". Per-case rows carry the granular pass/fail + reason. +CREATE TABLE IF NOT EXISTS eval_runs ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + skill_name TEXT NOT NULL, + skill_commit TEXT NOT NULL, + suite TEXT NOT NULL, -- requested suite, or '*' for all + session_id TEXT, -- the graded session + started_at DATETIME NOT NULL, + passed INTEGER NOT NULL, + failed INTEGER NOT NULL, + pass INTEGER NOT NULL -- 1 only when every case passed +); +CREATE INDEX IF NOT EXISTS idx_eval_runs_skill ON eval_runs(skill_name, skill_commit); +CREATE INDEX IF NOT EXISTS idx_eval_runs_started ON eval_runs(started_at); + +CREATE TABLE IF NOT EXISTS eval_case_results ( + eval_run_id INTEGER NOT NULL, + suite TEXT NOT NULL, + case_name TEXT NOT NULL, + pass INTEGER NOT NULL, + detail TEXT +); +CREATE INDEX IF NOT EXISTS idx_eval_case_run ON eval_case_results(eval_run_id); diff --git a/internal/ops/store/scan.go b/internal/ops/store/scan.go index 907e025..6853183 100644 --- a/internal/ops/store/scan.go +++ b/internal/ops/store/scan.go @@ -28,3 +28,11 @@ func nullInt64Ptr(v sql.NullInt64) *int64 { n := v.Int64 return &n } + +// boolToInt maps a Go bool to SQLite's 0/1 integer encoding. +func boolToInt(b bool) int { + if b { + return 1 + } + return 0 +} diff --git a/internal/ops/store/session_meta.go b/internal/ops/store/session_meta.go index c1d7f91..fc4726f 100644 --- a/internal/ops/store/session_meta.go +++ b/internal/ops/store/session_meta.go @@ -32,8 +32,12 @@ type SessionMetaRow struct { Skills []string `json:"skills,omitempty"` // distinct skill names, first-use order // Session token totals. nil = the native store reported no usage on that // side (serialized as absent, rendered n/a — never 0). - TokensIn *int64 `json:"tokens_in,omitempty"` - TokensOut *int64 `json:"tokens_out,omitempty"` + TokensIn *int64 `json:"tokens_in,omitempty"` + TokensOut *int64 `json:"tokens_out,omitempty"` + // Outcome is the session-level rollup of its TOOL/SKILL spans' qvr.outcome + // (worst-of: failure > blocked > success). "" = no span carried an outcome + // (serialized as absent, rendered "unknown" — never success). + Outcome string `json:"outcome,omitempty"` DeriverVersion int `json:"deriver_version"` DerivedAt time.Time `json:"derived_at"` } @@ -70,25 +74,37 @@ type SessionMetaFilter struct { const sessionMetaColumns = `session_id, agent_name, source_session_id, source_path, working_directory, git_branch, model, title, started_ms, ended_ms, - turns, tools, skills, tokens_in, tokens_out, deriver_version, derived_at` + turns, tools, skills, tokens_in, tokens_out, outcome, deriver_version, derived_at` -// sessionMetaColumnsLegacy reads a DB that predates migration 0006 (opened -// read-only by `qvr ui --no-discover`, so the migration can't apply): the -// token columns come back as NULL (n/a) instead of failing the whole list. +// sessionMetaColumnsNoOutcome reads a DB at schema 0006 (tokens present, no +// outcome yet — e.g. opened read-only before migration 0007 can apply): the +// outcome column comes back NULL (unknown) instead of failing the whole list. +const sessionMetaColumnsNoOutcome = `session_id, agent_name, source_session_id, source_path, + working_directory, git_branch, model, title, started_ms, ended_ms, + turns, tools, skills, tokens_in, tokens_out, NULL, deriver_version, derived_at` + +// sessionMetaColumnsLegacy reads a DB that predates migration 0006: both the +// token columns and outcome come back NULL (n/a) instead of failing the list. const sessionMetaColumnsLegacy = `session_id, agent_name, source_session_id, source_path, working_directory, git_branch, model, title, started_ms, ended_ms, - turns, tools, skills, NULL, NULL, deriver_version, derived_at` + turns, tools, skills, NULL, NULL, NULL, deriver_version, derived_at` // staleTokensSchema reports the pre-0006 "tokens columns missing" condition. func staleTokensSchema(err error) bool { return err != nil && strings.Contains(err.Error(), "no such column: tokens_in") } +// staleOutcomeSchema reports the pre-0007 "outcome column missing" condition (a +// DB already at 0006, so tokens resolve but outcome doesn't). +func staleOutcomeSchema(err error) bool { + return err != nil && strings.Contains(err.Error(), "no such column: outcome") +} + const upsertSessionMetaSQL = `INSERT OR REPLACE INTO session_meta( session_id, agent_name, source_session_id, source_path, working_directory, git_branch, model, title, started_ms, ended_ms, turns, tools, skills, - tokens_in, tokens_out, deriver_version, derived_at -) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)` + tokens_in, tokens_out, outcome, deriver_version, derived_at +) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)` // ReplaceSessionDerivation atomically replaces a session's whole derived // projection — its session_meta row and all its spans — in one tx. This is the @@ -125,7 +141,7 @@ func (s *sqliteStore) ReplaceSessionDerivation(ctx context.Context, meta *Sessio nullableString(meta.Model), nullableString(meta.Title), meta.StartedMs, meta.EndedMs, meta.Turns, meta.Tools, skills, nullableInt64(meta.TokensIn), nullableInt64(meta.TokensOut), - meta.DeriverVersion, meta.DerivedAt.UTC(), + nullableString(meta.Outcome), meta.DeriverVersion, meta.DerivedAt.UTC(), ); err != nil { return fmt.Errorf("store: upsert session meta: %w", err) } @@ -135,26 +151,27 @@ func (s *sqliteStore) ReplaceSessionDerivation(ctx context.Context, meta *Sessio // ListSessionMeta returns unified session rows newest-first (by start time), // or by total tokens when the filter asks for it. func (s *sqliteStore) ListSessionMeta(ctx context.Context, f *SessionMetaFilter) ([]*SessionMetaRow, error) { - out, err := s.listSessionMeta(ctx, f, false) + // Cascade down the schema as columns go missing on read-only opens that + // predate a migration: full → no-outcome (0006) → legacy (pre-0006). + out, err := s.listSessionMeta(ctx, f, sessionMetaColumns, true) + if staleOutcomeSchema(err) { + out, err = s.listSessionMeta(ctx, f, sessionMetaColumnsNoOutcome, true) + } if staleTokensSchema(err) { - return s.listSessionMeta(ctx, f, true) + out, err = s.listSessionMeta(ctx, f, sessionMetaColumnsLegacy, false) } return out, err } -func (s *sqliteStore) listSessionMeta(ctx context.Context, f *SessionMetaFilter, legacy bool) ([]*SessionMetaRow, error) { +func (s *sqliteStore) listSessionMeta(ctx context.Context, f *SessionMetaFilter, cols string, tokenSort bool) ([]*SessionMetaRow, error) { where, args := sessionMetaWhere(f) - cols := sessionMetaColumns - if legacy { - cols = sessionMetaColumnsLegacy - } q := `SELECT ` + cols + ` FROM session_meta` if len(where) > 0 { q += " WHERE " + strings.Join(where, " AND ") } // On a legacy schema a token sort degrades to the default order — every // row's tokens read n/a, so there is no token order to honor. - if f != nil && f.SortByTokens && !legacy { + if f != nil && f.SortByTokens && tokenSort { q += ` ORDER BY (tokens_in IS NULL AND tokens_out IS NULL), (COALESCE(tokens_in,0)+COALESCE(tokens_out,0)) DESC, started_ms DESC` } else { @@ -185,6 +202,9 @@ func (s *sqliteStore) listSessionMeta(ctx context.Context, f *SessionMetaFilter, // GetSessionMeta returns one session's unified row, or nil when absent. func (s *sqliteStore) GetSessionMeta(ctx context.Context, sessionID uuid.UUID) (*SessionMetaRow, error) { m, err := s.getSessionMeta(ctx, sessionID, sessionMetaColumns) + if staleOutcomeSchema(err) { + m, err = s.getSessionMeta(ctx, sessionID, sessionMetaColumnsNoOutcome) + } if staleTokensSchema(err) { return s.getSessionMeta(ctx, sessionID, sessionMetaColumnsLegacy) } @@ -248,16 +268,18 @@ func scanSessionMeta(row interface{ Scan(...any) error }) (*SessionMetaRow, erro srcSession, srcPath, wd sql.NullString branch, model, title, skillsJS sql.NullString tokensIn, tokensOut sql.NullInt64 + outcome sql.NullString derivedAt time.Time ) if err := row.Scan(&sid, &m.AgentName, &srcSession, &srcPath, &wd, &branch, &model, &title, &m.StartedMs, &m.EndedMs, - &m.Turns, &m.Tools, &skillsJS, &tokensIn, &tokensOut, + &m.Turns, &m.Tools, &skillsJS, &tokensIn, &tokensOut, &outcome, &m.DeriverVersion, &derivedAt); err != nil { return nil, fmt.Errorf("store: scan session meta: %w", err) } m.TokensIn = nullInt64Ptr(tokensIn) m.TokensOut = nullInt64Ptr(tokensOut) + m.Outcome = outcome.String id, err := uuid.Parse(sid) if err != nil { return nil, fmt.Errorf("store: bad session_meta session_id %q: %w", sid, err) diff --git a/internal/ops/store/session_meta_test.go b/internal/ops/store/session_meta_test.go index 7126ea5..b69649d 100644 --- a/internal/ops/store/session_meta_test.go +++ b/internal/ops/store/session_meta_test.go @@ -205,6 +205,35 @@ func i64Equal(a, b *int64) bool { return *a == *b } +// TestSessionMetaOutcome_RoundTrip pins the nullable outcome column: a set +// verdict reads back, and "" (no span carried an outcome) reads back as "". +func TestSessionMetaOutcome_RoundTrip(t *testing.T) { + st := openTestStore(t) + ctx := context.Background() + + withOutcome, unknown := uuid.New(), uuid.New() + o := metaFor(withOutcome, "claude-code", 1000, "code-review") + o.Outcome = "failure" + if err := st.ReplaceSessionDerivation(ctx, o, nil); err != nil { + t.Fatalf("derive with outcome: %v", err) + } + if err := st.ReplaceSessionDerivation(ctx, metaFor(unknown, "codex", 2000, "tdd"), nil); err != nil { + t.Fatalf("derive without outcome: %v", err) + } + + got, err := st.GetSessionMeta(ctx, withOutcome) + if err != nil || got == nil { + t.Fatalf("get: %v %v", got, err) + } + if got.Outcome != "failure" { + t.Errorf("outcome = %q, want failure", got.Outcome) + } + got2, _ := st.GetSessionMeta(ctx, unknown) + if got2 == nil || got2.Outcome != "" { + t.Errorf("outcome = %q, want \"\" (unknown)", got2.Outcome) + } +} + // TestGetSessionMeta_AbsentIsNil pins the no-row contract: nil, not an error. func TestGetSessionMeta_AbsentIsNil(t *testing.T) { st := openTestStore(t) diff --git a/internal/ops/store/sqlite.go b/internal/ops/store/sqlite.go index e919aba..77fa03f 100644 --- a/internal/ops/store/sqlite.go +++ b/internal/ops/store/sqlite.go @@ -145,6 +145,12 @@ func (s *sqliteStore) DeleteSession(ctx context.Context, sessionID uuid.UUID) (i if _, err := tx.ExecContext(ctx, `DELETE FROM trace_cursors WHERE session_id = ?`, id); err != nil { return 0, fmt.Errorf("store: delete session cursor: %w", err) } + // Annotations are human verdicts on this session; a hard session delete + // removes them too (nothing left to annotate). A rederive does NOT reach + // here, so verdicts survive re-derivation — only a full purge clears them. + if _, err := tx.ExecContext(ctx, `DELETE FROM annotations WHERE session_id = ?`, id); err != nil { + return 0, fmt.Errorf("store: delete session annotations: %w", err) + } if err := tx.Commit(); err != nil { return 0, fmt.Errorf("store: delete session commit: %w", err) } diff --git a/internal/ops/store/store.go b/internal/ops/store/store.go index ceea15c..2f5b741 100644 --- a/internal/ops/store/store.go +++ b/internal/ops/store/store.go @@ -129,6 +129,20 @@ type Store interface { PutLockSnapshots(ctx context.Context, sessionID uuid.UUID, rows []*LockSnapshotRow) error GetLockSnapshots(ctx context.Context, sessionID uuid.UUID) (map[string]*LockSnapshotRow, error) + // PutAnnotation appends a human verdict on a session (the outer-loop + // feedback signal); ListAnnotations reads them back, newest-first. Unlike + // spans/session_meta, annotations are NOT regenerated by rederive — they are + // durable human input. See annotations.go + migration 0008. + PutAnnotation(ctx context.Context, a *AnnotationRow) error + ListAnnotations(ctx context.Context, f *AnnotationFilter) ([]*AnnotationRow, error) + + // PutEvalRun records one eval run (verdict + per-case rows), keyed by + // {skill_name, skill_commit} — the lock × evidence join. ListEvalRuns reads + // them back newest-first for the gate and for lineage. See eval_runs.go + + // migration 0009. + PutEvalRun(ctx context.Context, r *EvalRunRow) (int64, error) + ListEvalRuns(ctx context.Context, f *EvalRunFilter) ([]*EvalRunRow, error) + // DeleteRawBefore sweeps raw rows captured before cutoff. Returns the // number of rows deleted. DeleteRawBefore(ctx context.Context, cutoff time.Time) (int64, error) From 9b007fda7e1c37c2edbfb18a87e7538abff730f0 Mon Sep 17 00:00:00 2001 From: Rakshith Date: Tue, 16 Jun 2026 23:41:18 -0400 Subject: [PATCH 02/10] =?UTF-8?q?feat(skills):=20evolve-skill=20=E2=80=94?= =?UTF-8?q?=20the=20self-improvement=20orchestrator?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A qvr-installable skill that runs the outer loop over the CLI: observe (qvr audit) -> edit (qvr edit) -> re-run -> gate (qvr ops eval run) -> open a PR. Evidence-gated; never auto-merges or auto-installs. Ships a CI recipe for running the loop unattended. --- skills/evolve-skill/SKILL.md | 183 ++++++++++++++++++++ skills/evolve-skill/references/ci-recipe.md | 85 +++++++++ 2 files changed, 268 insertions(+) create mode 100644 skills/evolve-skill/SKILL.md create mode 100644 skills/evolve-skill/references/ci-recipe.md diff --git a/skills/evolve-skill/SKILL.md b/skills/evolve-skill/SKILL.md new file mode 100644 index 0000000..96a9810 --- /dev/null +++ b/skills/evolve-skill/SKILL.md @@ -0,0 +1,183 @@ +--- +name: evolve-skill +description: > + Runs a self-improvement loop on an installed skill: read what it actually did + (auto outcomes + human verdicts), draft a fix to its SKILL.md, and keep the fix + only if qvr's eval gate shows the score improve. Use when a user wants a skill + to get better from feedback over time — e.g. "improve my triage skill from the + feedback", "evolve this skill", "close the loop on skill quality", "auto-tune a + skill against its evals", or "why did this skill regress and how do I fix it". + Drives qvr audit + qvr ops; never auto-installs or auto-merges — it opens a PR + for a human to merge. Experimental. +metadata: + author: quiver-playground + version: "1.0.0" +--- + +# Evolve a skill with qvr's eval gate + +This is the **outer loop** of skill self-improvement, run as an agent over qvr's +CLI. The **inner loop** already happened: agents used the skill, and qvr captured +every run — lock-verified to the exact installed commit — via `qvr audit`. Your +job here is to read those runs plus any human verdicts, draft an improvement to +the skill's `SKILL.md`, and **prove it helps before it ships** by re-running the +skill and grading it with `qvr ops eval`. A candidate that doesn't improve the +eval score is discarded; one that does is opened as a pull request for a human to +merge. + +The discipline is deliberate: qvr is **evidence-gated**. Skills do not auto-evolve +from raw model confidence — every change passes a deterministic eval keyed to the +skill's locked commit, because curated-with-evidence beats blind self-generation. + +## When to use this + +- A skill has been running for a while and the user wants it to improve from how + it actually performed (failures, human corrections), not from a guess. +- The user has recorded verdicts with `qvr audit annotate` and wants them acted on. +- A skill regressed after an edit and the user wants the loop to find and fix it. + +This **changes a skill's content** behind an eval gate. It is not observability +(that's `trace-skill-activity`), supply-chain verification +(`verify-skill-supply-chain`), or install management (`onboard-skills`). + +## Prerequisites + +- `qvr audit` capturing the target skill's runs (`qvr audit enable` + `discover`; + see `trace-skill-activity`). +- An **`evals.yaml`** beside the skill's `SKILL.md` — qvr's own manifest of + suites/cases graded deterministically over the captured trace (no model calls). + If the skill has none, write one first: encode the behaviors that matter, + starting with any run a human flagged as wrong (a regression case). + +A minimal `evals.yaml`: + +```yaml +version: 1 +suites: + - name: triage-correctness + cases: + - name: ambiguous-feature-needs-info + graders: + - type: outcome # the captured run did not error + expect: success + - type: text # and produced the right label + on: final_message + contains: ["needs-info"] + reject: ["ready-to-implement"] +``` + +Grader types available (all deterministic, all over the trace qvr already has): +`outcome`, `text`, `tool_sequence`, `tool_constraint`, `skill_invocation`, +`behavior`. Semantic ("is this explanation clear?") judgement is **not** a core +grader — delegate that to a judge skill and fold its verdict back in as an +annotation; keep the gate itself model-free. + +## The loop + +### 1. Observe — read how the skill actually did + +``` +qvr audit logs --kind SKILL --since 30d # where the skill fired +qvr audit sessions --skill # its runs, with auto outcomes +qvr audit annotations --skill # human verdicts ("bad: …, because …") +qvr audit export --session -o run.jsonl # one run verbatim, for close reading +``` + +Auto **outcome** (`success`/`failure`/`blocked`) tells you *that* a run went +wrong; a human **annotation** tells you *why*. Cluster the failures + notes into +one concrete weakness — the smallest change that would have fixed the flagged +runs. Record a verdict yourself if one is missing: + +``` +qvr audit annotate --skill --outcome bad --note "why it was wrong" +``` + +### 2. Update — draft the fix + +Eject a writable copy and edit it: + +``` +qvr edit # promote the symlinked skill to a real dir +# …edit /SKILL.md to address the weakness… +qvr diff # review the change +``` + +Make the **smallest** change that addresses the observed failures. Add the +flagged input as a regression case in `evals.yaml` if it isn't already one. + +### 3. Roll out the candidate — generate fresh evidence + +Run the edited skill on the held-out / regression inputs (this is a real agent +run — execution stays with the agent, not qvr), then let qvr capture it: + +``` +# …invoke the skill on the regression input(s)… +qvr audit discover --agent # capture the fresh run(s) +``` + +### 4. Evaluate — the gate + +``` +qvr ops eval run --suite +``` + +This grades the skill's most recent captured run against the suite, **keyed to +the locked commit**, and exits non-zero if any case fails. **Keep the edit only +if the score strictly improved** — i.e. cases that failed before now pass. If it +did not improve, discard the change (`git checkout` the ejected dir or revert) +and go back to step 1 with what you learned. Inspect the trail any time: + +``` +qvr ops lineage # eval verdicts + annotations over commits +``` + +### 5. Ship — open a PR, never merge + +Once the gate passes, publish/commit the change on a branch and **open a pull +request for a human to review and merge**. Do **not** merge it yourself, and do +**not** auto-install the new version — a passing eval clears it for review, not +for an automatic landing. Summarize: the weakness observed, the change made, and +the eval score before → after. + +``` +qvr ops promote # confirms the locked commit has a passing eval +``` + +`qvr ops promote` is the gate made explicit: it refuses (non-zero exit) unless +the locked commit is backed by a passing eval, so a CI step or the loop can +branch on it. + +## Running it unattended + +The same loop runs on a schedule via CI — a scheduled job that proposes a PR, +plus a PR check that fails when `qvr ops eval run` regresses. See +[`references/ci-recipe.md`](references/ci-recipe.md). The schedule and the agent +runner live outside qvr (qvr is the substrate, not a scheduler); qvr provides the +capture, the gate, and the lineage. + +## Gotchas + +- **Evidence-gated, always.** A change ships only behind a passing eval keyed to + its commit. No eval improvement ⇒ no change. Resist "the model is confident." +- **Never auto-merge, never auto-install.** Open a PR; a human merges. A passing + eval is a green light to *review*, not to land. +- **Annotations are the "why."** Auto outcome alone can't tell a correct-but- + unhelpful run from a good one; the human verdict carries the intent the loop + optimizes toward. +- **Keep the gate model-free.** Deterministic graders over the real trace are the + core; push LLM judgement to a separate judge skill so the gate stays + reproducible and cheap. +- **Experimental.** `qvr ops` / `qvr audit` command names and the `evals.yaml` + schema may change; pin your qvr version if you script the loop. + +## Troubleshooting + +- *`skill … has no evals.yaml`* — write one beside `SKILL.md` (see Prerequisites). +- *`no captured sessions for skill … to evaluate`* — run the skill, then + `qvr audit discover`, before `qvr ops eval run`. +- *The eval passes but the skill still feels wrong* — your suite is under- + specifying the behavior. Add the failing case; the loop is only as good as its + evals. +- *Lineage shows no commit move* — the edited skill wasn't re-committed, so the + eval is keyed to the same commit. Commit the change (or `qvr edit`/`publish`) + so the gate distinguishes before from after. diff --git a/skills/evolve-skill/references/ci-recipe.md b/skills/evolve-skill/references/ci-recipe.md new file mode 100644 index 0000000..3dd69bc --- /dev/null +++ b/skills/evolve-skill/references/ci-recipe.md @@ -0,0 +1,85 @@ +# CI recipe: the outer loop on a schedule + +qvr is the substrate — capture, the eval gate, and lineage — not a scheduler or +an agent runner. To run the self-improvement loop unattended you wire two CI +jobs around qvr's CLI. Both are illustrative GitHub Actions; adapt to any +scheduler + cloud-agent platform. + +## Job A — PR regression gate (run on every pull request) + +Fails the PR when a changed skill's evals regress. This is the cheap, always-on +half: it never edits anything, it just runs the deterministic gate. + +```yaml +name: skill-evals +on: pull_request +jobs: + eval-gate: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - run: curl -fsSL https://quiver.sh/install | sh # install qvr + - run: qvr audit enable && qvr audit discover + # Grade each skill that ships an evals.yaml; non-zero exit fails the job. + - run: | + for s in $(qvr ls --output json | jq -r '.[].name'); do + qvr ops eval run "$s" || exit 1 + done +``` + +The gate grades the skill's most recent **captured** run — so the PR's CI must +first exercise the skill (your existing test/integration step), then +`qvr audit discover` captures it, then `qvr ops eval run` judges it. No agent is +run by qvr itself. + +## Job B — scheduled improver (run on a cron) + +Proposes improvements. This half needs an **agent** (a cloud coding agent / the +`evolve-skill` loop) and a **scheduler** — both outside qvr. The agent drives the +exact CLI loop from `SKILL.md`: observe → update → roll out → evaluate → open a +PR. + +```yaml +name: evolve-skills +on: + schedule: + - cron: "0 6 * * *" # daily; tune to how fast feedback accrues +jobs: + evolve: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - run: curl -fsSL https://quiver.sh/install | sh + - run: qvr audit enable && qvr audit discover + # Hand the loop to your agent runner. It reads `qvr audit annotations` + + # `qvr audit export`, edits the skill behind `qvr edit`, re-runs it, + # gates on `qvr ops eval run`, and opens a PR. It MUST NOT merge. + - run: your-agent-runner run --skill evolve-skill --target "$SKILL" + env: + SKILL: triage-issue +``` + +## Rules the scheduled job must honor + +- **Open a PR; never merge.** A passing eval clears a change for human review, + not for an automatic landing. The job's output is a PR, full stop. +- **Never auto-install.** Do not `qvr add` the candidate into anyone's project as + part of the loop. +- **Gate every change.** No PR without a `qvr ops eval run` pass recorded for the + candidate's commit — confirm with `qvr ops promote ` (non-zero exit + blocks). +- **No AI attribution in the PR.** Title and body describe the change and the + before→after eval score; they carry no assistant byline or footer. + +## What qvr provides vs what you provide + +| Step | Provided by | +|------|-------------| +| Capture runs, lock-verified | **qvr** (`audit discover`) | +| Human + auto feedback | **qvr** (`audit annotate` / outcome) | +| Eval gate, keyed to commit | **qvr** (`ops eval run`) | +| Lineage (score over commits) | **qvr** (`ops lineage`) | +| Promotion gate | **qvr** (`ops promote`) | +| Scheduling (cron) | **you / CI** | +| Running the agent + drafting the diff | **you / a cloud agent** | +| Opening the PR | **you / CI** | From abe5b42ccf9f5f6e77703c6124b02ed4249e0e36 Mon Sep 17 00:00:00 2001 From: Rakshith Date: Tue, 16 Jun 2026 23:50:29 -0400 Subject: [PATCH 03/10] fix(ops): address greptile review findings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - ops promote: surface store errors from latestPassingEval instead of swallowing them — a transient DB failure no longer reads as "no passing eval" and silently blocks/misleads a CI gate (P1). - ops eval: evalRunRow takes the suite as a parameter rather than closing over the flag global, so it is pure and testable in isolation. - eval evidence: lastAssistantText now requires role=="assistant" before returning content, guarding against a future multi-message output capture. - eval evidence: Evidence.Tools now counts the same tool-sequence the tool_constraint grader walks, so a `maxTools` ceiling means the same thing under behavior and tool_constraint graders. - migration 0009: eval_case_results.eval_run_id is now a FK to eval_runs(id) ON DELETE CASCADE (FK enforcement is on), so case rows can't be orphaned by a future eval-run prune. --- cmd/ops_eval.go | 9 ++++---- cmd/ops_promote.go | 22 ++++++++++++------- internal/eval/evidence.go | 9 ++++++-- .../ops/store/migrations/0009_eval_runs.sql | 2 +- 4 files changed, 27 insertions(+), 15 deletions(-) diff --git a/cmd/ops_eval.go b/cmd/ops_eval.go index 1d1c04b..d9fc5e5 100644 --- a/cmd/ops_eval.go +++ b/cmd/ops_eval.go @@ -85,7 +85,7 @@ func runOpsEval(cmd *cobra.Command, args []string) error { return err } - if _, err := s.PutEvalRun(cmd.Context(), evalRunRow(rs, result)); err != nil { + if _, err := s.PutEvalRun(cmd.Context(), evalRunRow(rs, result, evalSuite)); err != nil { return fmt.Errorf("record eval run: %w", err) } @@ -99,9 +99,10 @@ func runOpsEval(cmd *cobra.Command, args []string) error { return nil } -// evalRunRow flattens a run result into the persisted row + its case rows. -func evalRunRow(rs *resolvedSkill, r *eval.RunResult) *store.EvalRunRow { - suite := evalSuite +// evalRunRow flattens a run result into the persisted row + its case rows. The +// requested suite is passed in (not read from the flag global) so the function +// is pure and testable in isolation. +func evalRunRow(rs *resolvedSkill, r *eval.RunResult, suite string) *store.EvalRunRow { if suite == "" { suite = "*" } diff --git a/cmd/ops_promote.go b/cmd/ops_promote.go index 97eca56..2e6e1a6 100644 --- a/cmd/ops_promote.go +++ b/cmd/ops_promote.go @@ -54,7 +54,10 @@ func runOpsPromote(cmd *cobra.Command, args []string) error { } defer s.Close() - passing := latestPassingEval(cmd, s, rs) + passing, err := latestPassingEval(cmd, s, rs) + if err != nil { + return fmt.Errorf("look up eval history for %s: %w", rs.Name, err) + } decision := promoteDecision(rs, passing) if outputFormat == "json" { @@ -98,20 +101,23 @@ func promoteDecision(rs *resolvedSkill, passing *store.EvalRunRow) promoteDecisi } // latestPassingEval returns the newest passing eval run for the skill's locked -// commit, or nil. A skill with no locked commit ("") can't be evidence-gated by -// commit, so this returns nil (promotion then requires --force-no-eval). -func latestPassingEval(cmd *cobra.Command, s store.Store, rs *resolvedSkill) *store.EvalRunRow { +// commit, or nil when there is none. A skill with no locked commit ("") can't be +// evidence-gated by commit, so it returns nil (promotion then requires +// --force-no-eval). A store error is propagated, NOT swallowed — otherwise a +// transient DB failure would read as "no passing eval" and silently block (or +// mislead) a CI gate. +func latestPassingEval(cmd *cobra.Command, s store.Store, rs *resolvedSkill) (*store.EvalRunRow, error) { if rs.Commit == "" { - return nil + return nil, nil } runs, err := s.ListEvalRuns(cmd.Context(), &store.EvalRunFilter{SkillName: rs.Name, SkillCommit: rs.Commit}) if err != nil { - return nil + return nil, err } for _, r := range runs { // newest-first if r.Pass { - return r + return r, nil } } - return nil + return nil, nil } diff --git a/internal/eval/evidence.go b/internal/eval/evidence.go index 11a4cdd..8c9bbbb 100644 --- a/internal/eval/evidence.go +++ b/internal/eval/evidence.go @@ -34,7 +34,6 @@ func BuildEvidence(meta *store.SessionMetaRow, spans []*store.SpanRow) *Evidence SessionID: meta.SessionID.String(), Outcome: meta.Outcome, Skills: append([]string(nil), meta.Skills...), - Tools: int(meta.Tools), Turns: int(meta.Turns), DurationMs: meta.DurationMs(), } @@ -57,6 +56,12 @@ func BuildEvidence(meta *store.SessionMetaRow, spans []*store.SpanRow) *Evidence } } e.FinalMessage = lastLLMText + // Tools counts the same tool-named spans the tool_constraint grader walks + // (ToolSequence, which includes the skill-load call), so a `maxTools` ceiling + // means the same thing whether an author writes it under a behavior or a + // tool_constraint grader. (Distinct from meta.Tools, which excludes SKILL + // spans and feeds the session list, not the gate.) + e.Tools = len(e.ToolSequence) return e } @@ -87,7 +92,7 @@ func lastAssistantText(attrs map[string]any) string { return "" } for _, m := range slices.Backward(msgs) { - if m.Content != "" { + if m.Role == "assistant" && m.Content != "" { return m.Content } } diff --git a/internal/ops/store/migrations/0009_eval_runs.sql b/internal/ops/store/migrations/0009_eval_runs.sql index 340ea8c..de12808 100644 --- a/internal/ops/store/migrations/0009_eval_runs.sql +++ b/internal/ops/store/migrations/0009_eval_runs.sql @@ -18,7 +18,7 @@ CREATE INDEX IF NOT EXISTS idx_eval_runs_skill ON eval_runs(skill_name, skill_co CREATE INDEX IF NOT EXISTS idx_eval_runs_started ON eval_runs(started_at); CREATE TABLE IF NOT EXISTS eval_case_results ( - eval_run_id INTEGER NOT NULL, + eval_run_id INTEGER NOT NULL REFERENCES eval_runs(id) ON DELETE CASCADE, suite TEXT NOT NULL, case_name TEXT NOT NULL, pass INTEGER NOT NULL, From 99ba8fa92e39bb39087b55362155f82dc277d47f Mon Sep 17 00:00:00 2001 From: Rakshith Date: Tue, 16 Jun 2026 23:58:17 -0400 Subject: [PATCH 04/10] fix(ops): promoteDecision takes flags as parameters Mirror the evalRunRow refactor: promoteDecision now receives force/reason as explicit arguments instead of reading the Cobra flag globals, so the pure gate logic is independently testable and its sub-tests no longer mutate package state. --- cmd/ops_promote.go | 8 ++++---- cmd/ops_test.go | 11 ++++------- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/cmd/ops_promote.go b/cmd/ops_promote.go index 2e6e1a6..db7a508 100644 --- a/cmd/ops_promote.go +++ b/cmd/ops_promote.go @@ -58,7 +58,7 @@ func runOpsPromote(cmd *cobra.Command, args []string) error { if err != nil { return fmt.Errorf("look up eval history for %s: %w", rs.Name, err) } - decision := promoteDecision(rs, passing) + decision := promoteDecision(rs, passing, promoteForce, promoteReason) if outputFormat == "json" { if err := printer.JSON(decision); err != nil { @@ -84,13 +84,13 @@ type promoteDecisionResult struct { Message string `json:"message"` } -func promoteDecision(rs *resolvedSkill, passing *store.EvalRunRow) promoteDecisionResult { - d := promoteDecisionResult{Skill: rs.Name, Commit: shortCommit(rs.Commit), Reason: promoteReason} +func promoteDecision(rs *resolvedSkill, passing *store.EvalRunRow, force bool, reason string) promoteDecisionResult { + d := promoteDecisionResult{Skill: rs.Name, Commit: shortCommit(rs.Commit), Reason: reason} switch { case passing != nil: d.Promoted = true d.Message = fmt.Sprintf("%s @ %s is backed by a passing eval (run #%d) — clear to promote", rs.Name, shortCommit(rs.Commit), passing.ID) - case promoteForce: + case force: d.Promoted = true d.Forced = true d.Message = fmt.Sprintf("%s @ %s promoted WITHOUT a passing eval (--force-no-eval)", rs.Name, shortCommit(rs.Commit)) diff --git a/cmd/ops_test.go b/cmd/ops_test.go index fb37894..f972e2c 100644 --- a/cmd/ops_test.go +++ b/cmd/ops_test.go @@ -87,29 +87,26 @@ func TestOpsPromote_Gate(t *testing.T) { } // TestPromoteDecision unit-tests the pure gate logic across its three branches. +// Inputs are passed explicitly (no flag globals), so sub-tests stay independent. func TestPromoteDecision(t *testing.T) { rs := &resolvedSkill{Name: "guard-tests", Commit: "abc1234"} t.Run("passing eval clears it", func(t *testing.T) { - promoteForce = false - d := promoteDecision(rs, &store.EvalRunRow{ID: 7, Pass: true}) + d := promoteDecision(rs, &store.EvalRunRow{ID: 7, Pass: true}, false, "") if !d.Promoted || d.Forced { t.Errorf("want promoted, not forced: %+v", d) } }) t.Run("no eval refuses", func(t *testing.T) { - promoteForce = false - if d := promoteDecision(rs, nil); d.Promoted { + if d := promoteDecision(rs, nil, false, ""); d.Promoted { t.Errorf("want refusal, got %+v", d) } }) t.Run("force overrides", func(t *testing.T) { - promoteForce = true - d := promoteDecision(rs, nil) + d := promoteDecision(rs, nil, true, "shipping anyway") if !d.Promoted || !d.Forced { t.Errorf("want forced promote: %+v", d) } - promoteForce = false }) } From d25873d50b36926c7cc55cfb1662391a82ba05b7 Mon Sep 17 00:00:00 2001 From: Rakshith Date: Wed, 17 Jun 2026 00:07:44 -0400 Subject: [PATCH 05/10] fix(ops): graceful read-only fallback + no orphaned eval refs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Round 3 greptile findings: - ListEvalRuns / ListAnnotations now return an empty result (not a raw "no such table" error) when their table is absent — the case a read-only open hits on a DB that predates migrations 0008/0009 (read-only opens skip migration apply). `qvr ops lineage` and `qvr audit annotations` therefore degrade gracefully on a freshly-upgraded install. New noSuchTable() guard. - DeleteSession nulls eval_runs.session_id for the deleted session instead of leaving it dangling. The eval verdict is durable lineage evidence keyed by {skill, commit}, so it outlives the session; only the stale pointer is cleared. --- internal/ops/store/annotations.go | 3 ++ internal/ops/store/annotations_test.go | 18 ++++++++++ internal/ops/store/eval_runs.go | 3 ++ internal/ops/store/eval_runs_test.go | 50 ++++++++++++++++++++++++++ internal/ops/store/scan.go | 13 ++++++- internal/ops/store/sqlite.go | 7 ++++ 6 files changed, 93 insertions(+), 1 deletion(-) diff --git a/internal/ops/store/annotations.go b/internal/ops/store/annotations.go index c3db406..3874991 100644 --- a/internal/ops/store/annotations.go +++ b/internal/ops/store/annotations.go @@ -80,6 +80,9 @@ func (s *sqliteStore) ListAnnotations(ctx context.Context, f *AnnotationFilter) rows, err := s.db.QueryContext(ctx, q, args...) if err != nil { + if noSuchTable(err) { + return nil, nil // pre-0008 DB opened read-only: no annotations yet, not an error + } return nil, fmt.Errorf("store: list annotations: %w", err) } defer rows.Close() diff --git a/internal/ops/store/annotations_test.go b/internal/ops/store/annotations_test.go index 7cfae24..b72e35a 100644 --- a/internal/ops/store/annotations_test.go +++ b/internal/ops/store/annotations_test.go @@ -56,6 +56,24 @@ func TestAnnotations_RequiresOutcome(t *testing.T) { } } +// TestListAnnotations_MissingTableIsEmpty pins the read-only-upgrade fallback: a +// DB predating migration 0008 yields an empty result, not a "no such table" +// error (the path `qvr audit annotations` / `ops lineage` hit read-only). +func TestListAnnotations_MissingTableIsEmpty(t *testing.T) { + st := openTestStore(t) + sq, ok := st.(*sqliteStore) + if !ok { + t.Fatal("expected *sqliteStore") + } + if _, err := sq.db.Exec(`DROP TABLE annotations;`); err != nil { + t.Fatalf("drop table: %v", err) + } + got, err := st.ListAnnotations(context.Background(), &AnnotationFilter{Skill: "x"}) + if err != nil || got != nil { + t.Errorf("missing table: got (%v, %v), want (nil, nil)", got, err) + } +} + // TestAnnotations_SurviveRederive proves the whole point of a separate table: // re-deriving a session (which replaces spans + session_meta) does NOT touch // the human verdict. diff --git a/internal/ops/store/eval_runs.go b/internal/ops/store/eval_runs.go index 26da6de..0be7fc6 100644 --- a/internal/ops/store/eval_runs.go +++ b/internal/ops/store/eval_runs.go @@ -110,6 +110,9 @@ func (s *sqliteStore) ListEvalRuns(ctx context.Context, f *EvalRunFilter) ([]*Ev rows, err := s.db.QueryContext(ctx, q, args...) if err != nil { + if noSuchTable(err) { + return nil, nil // pre-0009 DB opened read-only: no runs yet, not an error + } return nil, fmt.Errorf("store: list eval runs: %w", err) } defer rows.Close() diff --git a/internal/ops/store/eval_runs_test.go b/internal/ops/store/eval_runs_test.go index 75c63a0..b3da2e7 100644 --- a/internal/ops/store/eval_runs_test.go +++ b/internal/ops/store/eval_runs_test.go @@ -3,6 +3,8 @@ package store import ( "context" "testing" + + "github.com/google/uuid" ) // TestEvalRuns_RoundTrip pins the write/read path: a run and its case rows land @@ -64,3 +66,51 @@ func TestEvalRuns_RequiresRow(t *testing.T) { t.Error("expected an error for a nil eval run") } } + +// TestListEvalRuns_MissingTableIsEmpty pins the read-only-upgrade fallback: a DB +// that predates migration 0009 (tables absent, migrations skipped on a +// read-only open) yields an empty result, not a "no such table" error. +func TestListEvalRuns_MissingTableIsEmpty(t *testing.T) { + st := openTestStore(t) + sq, ok := st.(*sqliteStore) + if !ok { + t.Fatal("expected *sqliteStore") + } + if _, err := sq.db.Exec(`DROP TABLE eval_case_results; DROP TABLE eval_runs;`); err != nil { + t.Fatalf("drop tables: %v", err) + } + got, err := st.ListEvalRuns(context.Background(), &EvalRunFilter{SkillName: "x"}) + if err != nil || got != nil { + t.Errorf("missing table: got (%v, %v), want (nil, nil)", got, err) + } +} + +// TestDeleteSession_NullsEvalRunSession proves an eval verdict OUTLIVES the +// session it graded (durable lineage), with its now-stale session_id nulled. +func TestDeleteSession_NullsEvalRunSession(t *testing.T) { + st := openTestStore(t) + ctx := context.Background() + sid := uuid.New() + if err := st.ReplaceSessionDerivation(ctx, metaFor(sid, "claude-code", 1000, "guard-tests"), nil); err != nil { + t.Fatalf("derive: %v", err) + } + if _, err := st.PutEvalRun(ctx, &EvalRunRow{ + SkillName: "guard-tests", SkillCommit: "abc1234", Suite: "s", + SessionID: sid.String(), Passed: 1, Pass: true, + }); err != nil { + t.Fatalf("put eval run: %v", err) + } + if _, err := st.DeleteSession(ctx, sid); err != nil { + t.Fatalf("delete session: %v", err) + } + runs, err := st.ListEvalRuns(ctx, &EvalRunFilter{SkillName: "guard-tests"}) + if err != nil { + t.Fatalf("list: %v", err) + } + if len(runs) != 1 { + t.Fatalf("eval verdict did not survive session delete: got %d", len(runs)) + } + if runs[0].SessionID != "" { + t.Errorf("stale session_id not cleared: %q", runs[0].SessionID) + } +} diff --git a/internal/ops/store/scan.go b/internal/ops/store/scan.go index 6853183..2859c54 100644 --- a/internal/ops/store/scan.go +++ b/internal/ops/store/scan.go @@ -1,6 +1,17 @@ package store -import "database/sql" +import ( + "database/sql" + "strings" +) + +// noSuchTable reports a "missing table" error — the condition a read-only open +// hits when the DB predates a migration that adds a table (read-only opens skip +// migrations.Apply). Callers degrade to an empty result, mirroring the +// stale-column fallback the session_meta read path uses. +func noSuchTable(err error) bool { + return err != nil && strings.Contains(err.Error(), "no such table") +} // nullableString maps "" → SQL NULL so empty Go strings round-trip as NULL // columns rather than empty-string columns. diff --git a/internal/ops/store/sqlite.go b/internal/ops/store/sqlite.go index 77fa03f..9cde201 100644 --- a/internal/ops/store/sqlite.go +++ b/internal/ops/store/sqlite.go @@ -151,6 +151,13 @@ func (s *sqliteStore) DeleteSession(ctx context.Context, sessionID uuid.UUID) (i if _, err := tx.ExecContext(ctx, `DELETE FROM annotations WHERE session_id = ?`, id); err != nil { return 0, fmt.Errorf("store: delete session annotations: %w", err) } + // An eval_run is keyed by {skill, commit} and is durable lineage evidence, + // so it OUTLIVES the graded session — but its session_id would otherwise + // dangle. Null it instead of deleting the verdict, removing the stale + // pointer while keeping the gate history intact. + if _, err := tx.ExecContext(ctx, `UPDATE eval_runs SET session_id = NULL WHERE session_id = ?`, id); err != nil { + return 0, fmt.Errorf("store: clear session eval refs: %w", err) + } if err := tx.Commit(); err != nil { return 0, fmt.Errorf("store: delete session commit: %w", err) } From 53bdf9fd3834e7acca1f589d43215f129e0051d4 Mon Sep 17 00:00:00 2001 From: Rakshith Date: Wed, 17 Jun 2026 00:15:47 -0400 Subject: [PATCH 06/10] fix(ops): actionable promote refusal + annotations created_at index Round 4 greptile findings: - ops promote: when a skill has no locked commit, the refusal now points straight at --force-no-eval instead of suggesting `qvr ops eval run`, which would record under an empty commit and loop back to the same refusal. - migration 0008: index annotations(created_at) so the --since range filter on `audit annotations` / `ops lineage` uses a B-tree seek, not a scan. --- cmd/ops_promote.go | 14 ++++++++++++-- cmd/ops_test.go | 8 +++++--- internal/ops/store/migrations/0008_annotations.sql | 2 ++ 3 files changed, 19 insertions(+), 5 deletions(-) diff --git a/cmd/ops_promote.go b/cmd/ops_promote.go index db7a508..8f42d78 100644 --- a/cmd/ops_promote.go +++ b/cmd/ops_promote.go @@ -68,12 +68,22 @@ func runOpsPromote(cmd *cobra.Command, args []string) error { printer.Info(decision.Message) } if !decision.Promoted { - return fmt.Errorf("refusing to promote %s: no passing eval for commit %s (run `qvr ops eval run %s`, or pass --force-no-eval)", - rs.Name, shortCommit(rs.Commit), rs.Name) + return promoteRefusalError(rs) } return nil } +// promoteRefusalError builds the refusal, guiding the user to the path that can +// actually unblock them. For an unlocked skill (no commit), `qvr ops eval run` +// would just record under an empty commit and loop back to this same refusal — +// so --force-no-eval is the only real way forward and the message says so. +func promoteRefusalError(rs *resolvedSkill) error { + if rs.Commit == "" { + return fmt.Errorf("refusing to promote %s: it has no locked commit to evidence-gate on — pass --force-no-eval to promote anyway, or install it through the lock so eval verdicts can pin to its commit", rs.Name) + } + return fmt.Errorf("refusing to promote %s: no passing eval for commit %s — run `qvr ops eval run %s`, or pass --force-no-eval", rs.Name, shortCommit(rs.Commit), rs.Name) +} + // promoteDecisionResult is the JSON/text shape of a promote check. type promoteDecisionResult struct { Skill string `json:"skill"` diff --git a/cmd/ops_test.go b/cmd/ops_test.go index f972e2c..17fac15 100644 --- a/cmd/ops_test.go +++ b/cmd/ops_test.go @@ -64,9 +64,11 @@ func TestOpsPromote_Gate(t *testing.T) { dir := writeGuardTestsFixture(t) seedSession(t, cfg, sessionSeed{StartedMs: 1000, Skill: "guard-tests", Outcome: "success", Tools: []string{"Bash"}}) - // No eval recorded yet → refuse. - if _, _, err := runRoot(t, nil, "ops", "promote", "guard-tests", "--skill-dir", dir); err == nil { - t.Error("expected promote to refuse without a passing eval") + // No eval recorded yet (and no locked commit) → refuse, and point at the + // only path that can actually unblock: --force-no-eval. + _, _, err := runRoot(t, nil, "ops", "promote", "guard-tests", "--skill-dir", dir) + if err == nil || !strings.Contains(err.Error(), "force-no-eval") { + t.Errorf("want a refusal guiding to --force-no-eval, got %v", err) } // Forced → allowed, and the JSON records the override. diff --git a/internal/ops/store/migrations/0008_annotations.sql b/internal/ops/store/migrations/0008_annotations.sql index e35f90b..75c3cb6 100644 --- a/internal/ops/store/migrations/0008_annotations.sql +++ b/internal/ops/store/migrations/0008_annotations.sql @@ -17,3 +17,5 @@ CREATE TABLE IF NOT EXISTS annotations ( ); CREATE INDEX IF NOT EXISTS idx_annotations_session ON annotations(session_id); CREATE INDEX IF NOT EXISTS idx_annotations_skill ON annotations(skill); +-- created_at backs the --since range filter on `audit annotations` / `ops lineage`. +CREATE INDEX IF NOT EXISTS idx_annotations_created ON annotations(created_at); From ada7d2ba8dbe4af06d590882df6227e18817965e Mon Sep 17 00:00:00 2001 From: Rakshith Date: Wed, 17 Jun 2026 00:24:21 -0400 Subject: [PATCH 07/10] fix(eval): validate text `on` field + document annotations no-FK choice MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Round 5 greptile findings: - text grader: the `on` field is now validated at load time (only final_message is accepted), so a typo like `on: message` is a manifest error instead of silently grading FinalMessage. evidenceField returns "" for any unknown field as defense-in-depth. - migration 0008: document why annotations deliberately has NO session_meta FK — a cascade FK would be fired by the rederive INSERT-OR-REPLACE and wipe the durable verdicts the table exists to keep; cleanup on a real purge lives in store.DeleteSession. --- internal/eval/eval_test.go | 2 ++ internal/eval/graders.go | 15 ++++++++++++--- .../ops/store/migrations/0008_annotations.sql | 7 +++++++ 3 files changed, 21 insertions(+), 3 deletions(-) diff --git a/internal/eval/eval_test.go b/internal/eval/eval_test.go index 489a590..acc0423 100644 --- a/internal/eval/eval_test.go +++ b/internal/eval/eval_test.go @@ -21,6 +21,8 @@ func TestParse_RejectsNoOpGraders(t *testing.T) { {"outcome no expect", "type: outcome", true}, {"text ok", "type: text\n contains: [\"x\"]", false}, {"text empty", "type: text", true}, + {"text on final_message", "type: text\n on: final_message\n contains: [\"x\"]", false}, + {"text on unknown", "type: text\n on: message\n contains: [\"x\"]", true}, {"sequence empty", "type: tool_sequence", true}, {"constraint empty", "type: tool_constraint", true}, {"constraint maxTools ok", "type: tool_constraint\n maxTools: 5", false}, diff --git a/internal/eval/graders.go b/internal/eval/graders.go index f1bfa7d..80149fe 100644 --- a/internal/eval/graders.go +++ b/internal/eval/graders.go @@ -51,7 +51,13 @@ var graderValidators = map[string]func(GraderSpec) error{ return need(g.Expect != "", "needs `expect`") }, "text": func(g GraderSpec) error { - return need(len(g.Contains) > 0 || len(g.Reject) > 0, "needs `contains` or `reject`") + switch { + case len(g.Contains) == 0 && len(g.Reject) == 0: + return fmt.Errorf("needs `contains` or `reject`") + case g.On != "" && g.On != "final_message": + return fmt.Errorf("unknown `on` field %q (want final_message)", g.On) + } + return nil }, "tool_sequence": func(g GraderSpec) error { return need(len(g.Sequence) > 0, "needs `sequence`") @@ -114,13 +120,16 @@ func gradeText(g GraderSpec, e *Evidence) GraderResult { } // evidenceField selects the text body a text grader reads. final_message is the -// default and only field today; an unknown name falls back to it. +// default and only field today. An unknown name returns "" (defensive: the +// manifest validator already rejects unknown `on` values, so a typo fails at +// load time rather than silently grading the wrong field here). Add new fields +// to this switch AND to the text validator together. func evidenceField(name string, e *Evidence) string { switch name { case "", "final_message": return e.FinalMessage default: - return e.FinalMessage + return "" } } diff --git a/internal/ops/store/migrations/0008_annotations.sql b/internal/ops/store/migrations/0008_annotations.sql index 75c3cb6..a45659c 100644 --- a/internal/ops/store/migrations/0008_annotations.sql +++ b/internal/ops/store/migrations/0008_annotations.sql @@ -7,6 +7,13 @@ -- skill scopes the verdict to one skill within the session. Append-only: a -- session can accrue several verdicts over time (the latest by created_at wins -- for consumers that want one). +-- +-- DELIBERATELY no `REFERENCES session_meta(session_id)` FK: the rederive write +-- path is `INSERT OR REPLACE INTO session_meta`, and under REPLACE SQLite first +-- DELETEs the conflicting row — which would fire an ON DELETE CASCADE and wipe +-- every annotation on each rederive, destroying the durable human verdicts this +-- table exists to preserve. Cleanup on a genuine session purge is handled +-- explicitly in store.DeleteSession instead (a deliberate app-layer invariant). CREATE TABLE IF NOT EXISTS annotations ( session_id TEXT NOT NULL, skill TEXT, From 4811cfacbc1062fc5ff229bda2060bcd54ef2092 Mon Sep 17 00:00:00 2001 From: Rakshith Date: Wed, 17 Jun 2026 01:03:28 -0400 Subject: [PATCH 08/10] fix(ops): gate honors newest verdict + drop ListEvalRuns N+1 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Round 6 greptile findings: - promote gate: latestPassingEval now clears only when the MOST RECENT eval for the locked commit passed — a newer failing run is no longer overridden by an older pass, so a known regression can't be promoted past. Pinned by TestLatestPassingEval_NewestVerdictWins. - ListEvalRuns: per-case rows now load only when EvalRunFilter.IncludeCases is set, so header-only callers (lineage, the promote gate) skip the N+1. WHERE-building and case-loading extracted to helpers (keeps cyclo <=15). --- cmd/ops_promote.go | 22 +++++----- cmd/ops_test.go | 50 ++++++++++++++++++++++ internal/ops/store/eval_runs.go | 64 +++++++++++++++++++--------- internal/ops/store/eval_runs_test.go | 2 +- 4 files changed, 105 insertions(+), 33 deletions(-) diff --git a/cmd/ops_promote.go b/cmd/ops_promote.go index 8f42d78..7a902df 100644 --- a/cmd/ops_promote.go +++ b/cmd/ops_promote.go @@ -110,24 +110,24 @@ func promoteDecision(rs *resolvedSkill, passing *store.EvalRunRow, force bool, r return d } -// latestPassingEval returns the newest passing eval run for the skill's locked -// commit, or nil when there is none. A skill with no locked commit ("") can't be -// evidence-gated by commit, so it returns nil (promotion then requires -// --force-no-eval). A store error is propagated, NOT swallowed — otherwise a -// transient DB failure would read as "no passing eval" and silently block (or -// mislead) a CI gate. +// latestPassingEval returns the commit's eval verdict when it is clear to +// promote: the MOST RECENT run for the locked commit, but only if that run +// passed. The freshest verdict governs — a newer failing run is NOT overridden +// by an older pass, so a known regression can't be promoted past. A skill with +// no locked commit ("") can't be evidence-gated by commit, so it returns nil +// (promotion then requires --force-no-eval). A store error is propagated, NOT +// swallowed — otherwise a transient DB failure would read as "no passing eval" +// and silently block (or mislead) a CI gate. func latestPassingEval(cmd *cobra.Command, s store.Store, rs *resolvedSkill) (*store.EvalRunRow, error) { if rs.Commit == "" { return nil, nil } - runs, err := s.ListEvalRuns(cmd.Context(), &store.EvalRunFilter{SkillName: rs.Name, SkillCommit: rs.Commit}) + runs, err := s.ListEvalRuns(cmd.Context(), &store.EvalRunFilter{SkillName: rs.Name, SkillCommit: rs.Commit, Limit: 1}) if err != nil { return nil, err } - for _, r := range runs { // newest-first - if r.Pass { - return r, nil - } + if len(runs) > 0 && runs[0].Pass { // newest-first; the latest verdict wins + return runs[0], nil } return nil, nil } diff --git a/cmd/ops_test.go b/cmd/ops_test.go index 17fac15..2220552 100644 --- a/cmd/ops_test.go +++ b/cmd/ops_test.go @@ -1,14 +1,64 @@ package cmd import ( + "context" "encoding/json" + "path/filepath" "strings" "testing" + "time" "github.com/astra-sh/qvr/internal/config" "github.com/astra-sh/qvr/internal/ops/store" + "github.com/spf13/cobra" ) +// TestLatestPassingEval_NewestVerdictWins pins the gate semantics: the most +// recent eval for the locked commit governs, so a newer FAIL is not overridden +// by an older PASS. +func TestLatestPassingEval_NewestVerdictWins(t *testing.T) { + ctx := context.Background() + st, err := store.Open(ctx, store.OpenOptions{Path: filepath.Join(t.TempDir(), "skillops.db")}) + if err != nil { + t.Fatalf("open store: %v", err) + } + defer st.Close() + + put := func(ts int64, pass bool) { + t.Helper() + if _, e := st.PutEvalRun(ctx, &store.EvalRunRow{ + SkillName: "g", SkillCommit: "c1", Suite: "s", + StartedAt: time.Unix(ts, 0).UTC(), Pass: pass, + }); e != nil { + t.Fatalf("put: %v", e) + } + } + rs := &resolvedSkill{Name: "g", Commit: "c1"} + cmd := &cobra.Command{} + cmd.SetContext(ctx) // cobra.Command.Context() is nil until set + + // older PASS, newer FAIL → gate must NOT clear. + put(1000, true) + put(2000, false) + got, err := latestPassingEval(cmd, st, rs) + if err != nil { + t.Fatalf("latestPassingEval: %v", err) + } + if got != nil { + t.Errorf("newest verdict is a FAIL; gate must not clear (got run #%d)", got.ID) + } + + // a fresh PASS lands → gate clears again. + put(3000, true) + got, err = latestPassingEval(cmd, st, rs) + if err != nil { + t.Fatalf("latestPassingEval: %v", err) + } + if got == nil { + t.Error("newest verdict is a PASS; gate should clear") + } +} + // TestOpsEval_NoDatabase pins the helpful error when nothing has been captured. func TestOpsEval_NoDatabase(t *testing.T) { isolatedHome(t, true) diff --git a/internal/ops/store/eval_runs.go b/internal/ops/store/eval_runs.go index 0be7fc6..404470a 100644 --- a/internal/ops/store/eval_runs.go +++ b/internal/ops/store/eval_runs.go @@ -39,6 +39,10 @@ type EvalRunFilter struct { SkillCommit string Since *time.Time Limit int + // IncludeCases loads each run's per-case verdict rows (an extra query per + // run). Off by default so header-only callers (lineage, the promote gate) + // don't pay an N+1 they never read. + IncludeCases bool } // PutEvalRun inserts a run and its case rows in one tx, returning the new run id. @@ -79,25 +83,11 @@ func (s *sqliteStore) PutEvalRun(ctx context.Context, r *EvalRunRow) (int64, err return id, nil } -// ListEvalRuns returns runs matching the filter, newest-first, each with its -// case rows loaded. +// ListEvalRuns returns runs matching the filter, newest-first. Per-case rows are +// loaded only when the filter sets IncludeCases (header-only callers skip the +// N+1). func (s *sqliteStore) ListEvalRuns(ctx context.Context, f *EvalRunFilter) ([]*EvalRunRow, error) { - var where []string - var args []any - if f != nil { - if f.SkillName != "" { - where = append(where, "skill_name = ?") - args = append(args, f.SkillName) - } - if f.SkillCommit != "" { - where = append(where, "skill_commit = ?") - args = append(args, f.SkillCommit) - } - if f.Since != nil { - where = append(where, "started_at >= ?") - args = append(args, f.Since.UTC()) - } - } + where, args := evalRunWhere(f) q := `SELECT id, skill_name, skill_commit, suite, session_id, started_at, passed, failed, pass FROM eval_runs` if len(where) > 0 { q += " WHERE " + strings.Join(where, " AND ") @@ -128,14 +118,46 @@ func (s *sqliteStore) ListEvalRuns(ctx context.Context, f *EvalRunFilter) ([]*Ev if err := rows.Err(); err != nil { return nil, err } - for _, r := range out { + if f != nil && f.IncludeCases { + if err := s.attachEvalCases(ctx, out); err != nil { + return nil, err + } + } + return out, nil +} + +// evalRunWhere builds the WHERE clauses and args for an eval-run filter. +func evalRunWhere(f *EvalRunFilter) ([]string, []any) { + var where []string + var args []any + if f == nil { + return where, args + } + if f.SkillName != "" { + where = append(where, "skill_name = ?") + args = append(args, f.SkillName) + } + if f.SkillCommit != "" { + where = append(where, "skill_commit = ?") + args = append(args, f.SkillCommit) + } + if f.Since != nil { + where = append(where, "started_at >= ?") + args = append(args, f.Since.UTC()) + } + return where, args +} + +// attachEvalCases loads each run's per-case verdict rows in place. +func (s *sqliteStore) attachEvalCases(ctx context.Context, runs []*EvalRunRow) error { + for _, r := range runs { cases, err := s.evalCases(ctx, r.ID) if err != nil { - return nil, err + return err } r.Cases = cases } - return out, nil + return nil } func (s *sqliteStore) evalCases(ctx context.Context, runID int64) ([]EvalCaseRow, error) { diff --git a/internal/ops/store/eval_runs_test.go b/internal/ops/store/eval_runs_test.go index b3da2e7..53d08cc 100644 --- a/internal/ops/store/eval_runs_test.go +++ b/internal/ops/store/eval_runs_test.go @@ -34,7 +34,7 @@ func TestEvalRuns_RoundTrip(t *testing.T) { t.Fatal("expected a non-zero run id") } - all, err := st.ListEvalRuns(ctx, &EvalRunFilter{SkillName: "triage-issue"}) + all, err := st.ListEvalRuns(ctx, &EvalRunFilter{SkillName: "triage-issue", IncludeCases: true}) if err != nil { t.Fatalf("list: %v", err) } From f7765d25e55d924ed2014f659f81247debb59428 Mon Sep 17 00:00:00 2001 From: Rakshith Date: Wed, 17 Jun 2026 01:19:40 -0400 Subject: [PATCH 09/10] chore(ops): drop dead OutcomeUnknown const + document eval_runs no-FK MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Round 7 greptile findings (style/docs only): - Remove the unused OutcomeUnknown constant — "unknown" is the ABSENCE of the attribute (""), and every comparison site already checks "", so the sentinel was dead and potentially misleading. - migration 0009: document why eval_runs.session_id deliberately omits a FK (durable verdict outlives the session; DeleteSession NULLs the pointer), mirroring the rationale comment migration 0008 carries. --- internal/ops/derive/span.go | 4 +++- internal/ops/store/migrations/0009_eval_runs.sql | 7 ++++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/internal/ops/derive/span.go b/internal/ops/derive/span.go index 5724c41..178a188 100644 --- a/internal/ops/derive/span.go +++ b/internal/ops/derive/span.go @@ -44,11 +44,13 @@ const ( // read model rolls these up into SessionMeta.Outcome (worst-of-spans). const OutcomeKey = "qvr.outcome" +// The three outcome values a span can carry. "Unknown" is NOT a value — it is +// the ABSENCE of the attribute (and the empty session rollup), so every +// comparison site checks against "" rather than a sentinel string. const ( OutcomeSuccess = "success" OutcomeFailure = "failure" OutcomeBlocked = "blocked" - OutcomeUnknown = "unknown" ) // classifyOutcome maps a tool result to an outcome. A non-error result is a diff --git a/internal/ops/store/migrations/0009_eval_runs.sql b/internal/ops/store/migrations/0009_eval_runs.sql index de12808..1075658 100644 --- a/internal/ops/store/migrations/0009_eval_runs.sql +++ b/internal/ops/store/migrations/0009_eval_runs.sql @@ -3,12 +3,17 @@ -- skill's evals.yaml; the verdict is pinned to the exact locked commit that -- ran, so lineage can show "at commit abc the safety suite failed; at def it -- passed". Per-case rows carry the granular pass/fail + reason. +-- session_id DELIBERATELY has no FK to session_meta: an eval_run is durable +-- lineage evidence keyed by {skill, commit} and OUTLIVES the session it graded. +-- store.DeleteSession NULLs this pointer on a genuine purge (keeping the verdict, +-- dropping the stale id) rather than cascading the verdict away — the same +-- app-layer-invariant choice migration 0008 makes for annotations. CREATE TABLE IF NOT EXISTS eval_runs ( id INTEGER PRIMARY KEY AUTOINCREMENT, skill_name TEXT NOT NULL, skill_commit TEXT NOT NULL, suite TEXT NOT NULL, -- requested suite, or '*' for all - session_id TEXT, -- the graded session + session_id TEXT, -- the graded session (nullable; see note above) started_at DATETIME NOT NULL, passed INTEGER NOT NULL, failed INTEGER NOT NULL, From dfa5cbfd5e93ce053a266efce1e8606c83977098 Mon Sep 17 00:00:00 2001 From: Rakshith Date: Wed, 17 Jun 2026 01:29:17 -0400 Subject: [PATCH 10/10] docs(evolve-skill): verified install in CI recipe, not curl|sh Round 8 greptile finding (security): the CI recipe piped `curl -fsSL https://quiver.sh/install | sh`, so an adopter copying it verbatim would execute unauthenticated network content in the runner with full secrets access. Replace both jobs' install step with a pinned `go install github.com/astra-sh/qvr@`, verified through the Go module checksum database. --- skills/evolve-skill/references/ci-recipe.md | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/skills/evolve-skill/references/ci-recipe.md b/skills/evolve-skill/references/ci-recipe.md index 3dd69bc..453687e 100644 --- a/skills/evolve-skill/references/ci-recipe.md +++ b/skills/evolve-skill/references/ci-recipe.md @@ -18,7 +18,13 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - - run: curl -fsSL https://quiver.sh/install | sh # install qvr + - uses: actions/setup-go@v5 + with: { go-version: '1.25' } + # Install a PINNED version, verified through the Go module checksum + # database — unlike `curl … | sh`, which executes unauthenticated network + # content in the runner (a MITM / compromised CDN / DNS hijack would run + # arbitrary code with full CI secrets access). Pin the tag you trust. + - run: go install github.com/astra-sh/qvr@v0.28.1 - run: qvr audit enable && qvr audit discover # Grade each skill that ships an evals.yaml; non-zero exit fails the job. - run: |