From 05d1a12744d985e35fa5e7369ba9e3a0182d1193 Mon Sep 17 00:00:00 2001
From: Rakshith <sairakshith977@gmail.com>
Date: Tue, 16 Jun 2026 23:41:18 -0400
Subject: [PATCH 01/10] =?UTF-8?q?feat(ops):=20skill=20self-improvement=20s?=
 =?UTF-8?q?ubstrate=20=E2=80=94=20outcome,=20annotations,=20evals?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds the qvr-core layer for evidence-gated skill evolution, keeping the
binary pure Go (no model calls, no scheduler):

- Outcome signal: TOOL/SKILL spans carry qvr.outcome (success|failure|
  blocked), rolled up to a session verdict; OTLP span status now reflects
  it instead of a hardcoded OK. Deriver v7->v8 (rederive backfills).
  Migration 0007.
- Human annotations: `qvr audit annotate` / `audit annotations` record a
  reviewer's verdict in a table that survives rederive (it is durable human
  input, not a regenerable projection). Migration 0008.
- Evals-as-data: evals.yaml manifest + deterministic graders (outcome,
  text, tool_sequence, tool_constraint, skill_invocation, behavior) graded
  over the trace qvr already captured — no model calls. internal/eval/.
- Eval store + gate: `qvr ops eval run` (exits non-zero on regression),
  `ops lineage`, `ops promote`, keyed by {skill, commit}. Migration 0009.

The seam that keeps core pure: execution stays with the agent; grading,
the gate, and lineage are the only parts in the binary, because qvr already
has the real lock-verified trace.
---
 cmd/acceptance_behavioral_test.go             | 109 +++++++++
 cmd/acceptance_loop_test.go                   | 213 +++++++++++++++++
 cmd/audit.go                                  |   8 +-
 cmd/audit_annotate.go                         | 112 +++++++++
 cmd/audit_annotate_test.go                    |  84 +++++++
 cmd/audit_annotations.go                      | 106 +++++++++
 cmd/audit_test.go                             |   6 +
 cmd/ops.go                                    | 126 ++++++++++
 cmd/ops_eval.go                               | 174 ++++++++++++++
 cmd/ops_lineage.go                            | 148 ++++++++++++
 cmd/ops_promote.go                            | 117 ++++++++++
 cmd/ops_test.go                               | 126 ++++++++++
 internal/eval/eval_test.go                    | 217 ++++++++++++++++++
 internal/eval/evidence.go                     |  95 ++++++++
 internal/eval/grade.go                        |  50 ++++
 internal/eval/graders.go                      | 205 +++++++++++++++++
 internal/eval/manifest.go                     | 157 +++++++++++++
 internal/eval/runner.go                       | 107 +++++++++
 internal/ops/derive/claude.go                 |   4 +-
 internal/ops/derive/derive.go                 |  44 +++-
 internal/ops/derive/outcome_test.go           | 156 +++++++++++++
 internal/ops/derive/shared.go                 |   1 +
 internal/ops/derive/span.go                   |  75 +++++-
 internal/ops/rawtrace/capture.go              |   1 +
 internal/ops/store/annotations.go             | 118 ++++++++++
 internal/ops/store/annotations_test.go        | 104 +++++++++
 internal/ops/store/eval_runs.go               | 177 ++++++++++++++
 internal/ops/store/eval_runs_test.go          |  66 ++++++
 .../migrations/0007_session_meta_outcome.sql  |   6 +
 .../ops/store/migrations/0008_annotations.sql |  19 ++
 .../ops/store/migrations/0009_eval_runs.sql   |  27 +++
 internal/ops/store/scan.go                    |   8 +
 internal/ops/store/session_meta.go            |  60 +++--
 internal/ops/store/session_meta_test.go       |  29 +++
 internal/ops/store/sqlite.go                  |   6 +
 internal/ops/store/store.go                   |  14 ++
 36 files changed, 3050 insertions(+), 25 deletions(-)
 create mode 100644 cmd/acceptance_behavioral_test.go
 create mode 100644 cmd/acceptance_loop_test.go
 create mode 100644 cmd/audit_annotate.go
 create mode 100644 cmd/audit_annotate_test.go
 create mode 100644 cmd/audit_annotations.go
 create mode 100644 cmd/ops.go
 create mode 100644 cmd/ops_eval.go
 create mode 100644 cmd/ops_lineage.go
 create mode 100644 cmd/ops_promote.go
 create mode 100644 cmd/ops_test.go
 create mode 100644 internal/eval/eval_test.go
 create mode 100644 internal/eval/evidence.go
 create mode 100644 internal/eval/grade.go
 create mode 100644 internal/eval/graders.go
 create mode 100644 internal/eval/manifest.go
 create mode 100644 internal/eval/runner.go
 create mode 100644 internal/ops/derive/outcome_test.go
 create mode 100644 internal/ops/store/annotations.go
 create mode 100644 internal/ops/store/annotations_test.go
 create mode 100644 internal/ops/store/eval_runs.go
 create mode 100644 internal/ops/store/eval_runs_test.go
 create mode 100644 internal/ops/store/migrations/0007_session_meta_outcome.sql
 create mode 100644 internal/ops/store/migrations/0008_annotations.sql
 create mode 100644 internal/ops/store/migrations/0009_eval_runs.sql

diff --git a/cmd/acceptance_behavioral_test.go b/cmd/acceptance_behavioral_test.go
new file mode 100644
index 0000000..1b6bb9a
--- /dev/null
+++ b/cmd/acceptance_behavioral_test.go
@@ -0,0 +1,109 @@
+package cmd
+
+import (
+	"encoding/json"
+	"os"
+	"path/filepath"
+	"testing"
+
+	"github.com/astra-sh/qvr/internal/config"
+)
+
+// TestAcceptance_BehavioralGate exercises the loop on a DIFFERENT dimension than
+// the triage label-match: a skill that must verify its edits by running tests.
+// The gate here is behavioral — tool sequence, tool constraints, skill
+// invocation, efficiency — so it proves the eval substrate generalizes past
+// simple text matching. First run edits without testing (fails); the corrected
+// run edits THEN tests (passes).
+func TestAcceptance_BehavioralGate(t *testing.T) {
+	isolatedHome(t, true)
+	cfg, err := config.Load()
+	if err != nil {
+		t.Fatalf("load config: %v", err)
+	}
+	skillDir := writeGuardTestsFixture(t)
+
+	// 1. A run that edited code but never ran the tests (Read, Edit — no Bash).
+	badID := seedSession(t, cfg, sessionSeed{
+		StartedMs: 1000, Skill: "guard-tests", FinalMsg: "Applied the change.",
+		Outcome: "success", Tools: []string{"Read", "Edit"},
+	})
+
+	// 2. Reviewer flags the unverified change.
+	if _, stderr, err := runRoot(t, nil, "audit", "annotate", badID,
+		"--skill", "guard-tests", "--outcome", "bad", "--note", "shipped without running tests"); err != nil {
+		t.Fatalf("annotate: err=%v stderr=%q", err, stderr)
+	}
+
+	// 3. Gate FAILS: tool_sequence Edit→Bash unsatisfied, Bash missing.
+	if _, _, err := runRoot(t, nil, "ops", "eval", "run", "guard-tests",
+		"--skill-dir", skillDir, "--output", "json"); err == nil {
+		t.Fatal("expected the baseline eval to FAIL for an unverified change")
+	}
+
+	// 4. Corrected run: Read, Edit, THEN Bash (runs the tests).
+	seedSession(t, cfg, sessionSeed{
+		StartedMs: 2000, Skill: "guard-tests", FinalMsg: "Applied the change and ran the tests.",
+		Outcome: "success", Tools: []string{"Read", "Edit", "Bash"},
+	})
+
+	// 5. Gate PASSES.
+	out, _, err := runRoot(t, nil, "ops", "eval", "run", "guard-tests",
+		"--skill-dir", skillDir, "--output", "json")
+	if err != nil {
+		t.Fatalf("expected the post-fix eval to PASS, got err=%v", err)
+	}
+	var res struct {
+		Pass   bool `json:"pass"`
+		Failed int  `json:"failed"`
+	}
+	if e := json.Unmarshal([]byte(out), &res); e != nil {
+		t.Fatalf("decode eval json: %v\n%s", e, out)
+	}
+	if !res.Pass || res.Failed != 0 {
+		t.Fatalf("post-fix eval = %+v, want pass with 0 failures", res)
+	}
+}
+
+// writeGuardTestsFixture creates a fixture skill whose suite asserts the skill
+// fired, edits were followed by a test run, and no network was used — all
+// behavioral graders over the captured trace.
+func writeGuardTestsFixture(t *testing.T) string {
+	t.Helper()
+	dir := filepath.Join(t.TempDir(), "guard-tests")
+	if err := os.MkdirAll(dir, 0o755); err != nil {
+		t.Fatal(err)
+	}
+	skill := `---
+name: guard-tests
+description: Edits code and always verifies the change by running the test suite.
+metadata:
+  author: quiver-playground
+  version: "1.0.0"
+---
+
+# Guard tests
+
+After editing code, always run the test suite to verify the change before
+reporting done. Never ship an edit you have not verified.
+`
+	evals := `version: 1
+suites:
+  - name: verifies-changes
+    cases:
+      - name: edits-then-runs-tests
+        graders:
+          - type: skill_invocation
+            expectSkills: ["guard-tests"]
+          - type: tool_sequence
+            sequence: ["Edit", "Bash"]
+          - type: tool_constraint
+            expectTools: ["Bash"]
+            rejectTools: ["WebFetch"]
+          - type: behavior
+            maxTools: 10
+`
+	mustWrite(t, filepath.Join(dir, "SKILL.md"), skill)
+	mustWrite(t, filepath.Join(dir, "evals.yaml"), evals)
+	return dir
+}
diff --git a/cmd/acceptance_loop_test.go b/cmd/acceptance_loop_test.go
new file mode 100644
index 0000000..1501187
--- /dev/null
+++ b/cmd/acceptance_loop_test.go
@@ -0,0 +1,213 @@
+package cmd
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"os"
+	"path/filepath"
+	"testing"
+
+	"github.com/astra-sh/qvr/internal/config"
+	"github.com/astra-sh/qvr/internal/ops"
+	"github.com/astra-sh/qvr/internal/ops/store"
+	"github.com/google/uuid"
+)
+
+// TestAcceptance_SelfImprovementLoop recreates the article's triage loop end to
+// end through the real CLI: a skill mis-triages an issue, a human flags it, the
+// eval gate FAILS, the skill is "improved" (a corrected run), and the gate now
+// PASSES — with the whole arc visible in `qvr ops lineage`. The fail→pass is the
+// branch's acceptance criterion.
+func TestAcceptance_SelfImprovementLoop(t *testing.T) {
+	isolatedHome(t, true)
+	cfg, err := config.Load()
+	if err != nil {
+		t.Fatalf("load config: %v", err)
+	}
+	skillDir := writeTriageFixture(t)
+
+	// 1. INNER LOOP: a real run that mis-triaged the issue (the article's
+	//    "ready-to-implement" mistake). Captured + skill-attributed.
+	badID := seedTriageSession(t, cfg, 1000, "Labeled this issue ready-to-implement.")
+
+	// 2. HUMAN FEEDBACK: the reviewer flips the verdict and says why.
+	if _, stderr, err := runRoot(t, nil, "audit", "annotate", badID,
+		"--skill", "triage-issue", "--outcome", "bad", "--note", "ambiguous — needs a setting, should be needs-info"); err != nil {
+		t.Fatalf("annotate: err=%v stderr=%q", err, stderr)
+	}
+
+	// 3. BASELINE GATE: the eval must FAIL on the misclassification.
+	if _, _, err := runRoot(t, nil, "ops", "eval", "run", "triage-issue",
+		"--skill-dir", skillDir, "--suite", "triage-correctness", "--output", "json"); err == nil {
+		t.Fatal("expected the baseline eval to FAIL on the misclassified run")
+	}
+
+	// 4. IMPROVE: the loop edits the skill and re-runs it; the corrected run is
+	//    captured (newer, so it is the most-recent session the eval grades).
+	seedTriageSession(t, cfg, 2000, "Labeled this issue needs-info pending a decision.")
+
+	// 5. POST-FIX GATE: the same eval must now PASS.
+	out, _, err := runRoot(t, nil, "ops", "eval", "run", "triage-issue",
+		"--skill-dir", skillDir, "--suite", "triage-correctness", "--output", "json")
+	if err != nil {
+		t.Fatalf("expected the post-fix eval to PASS, got err=%v", err)
+	}
+	var res struct {
+		Pass   bool `json:"pass"`
+		Passed int  `json:"passed"`
+		Failed int  `json:"failed"`
+	}
+	if e := json.Unmarshal([]byte(out), &res); e != nil {
+		t.Fatalf("decode eval json: %v\n%s", e, out)
+	}
+	if !res.Pass || res.Failed != 0 {
+		t.Fatalf("post-fix eval = %+v, want pass with 0 failures", res)
+	}
+
+	// 6. LINEAGE: the timeline shows the fail, then the pass, plus the verdict.
+	lo, _, err := runRoot(t, nil, "ops", "lineage", "triage-issue", "--output", "json")
+	if err != nil {
+		t.Fatalf("lineage: %v", err)
+	}
+	assertFailPassArc(t, lo)
+}
+
+// assertFailPassArc checks the lineage timeline carries the article's full arc:
+// at least one failed eval, one passed eval, and one human annotation.
+func assertFailPassArc(t *testing.T, lineageJSON string) {
+	t.Helper()
+	var timeline []struct {
+		Kind string `json:"kind"`
+		Pass *bool  `json:"pass"`
+	}
+	if e := json.Unmarshal([]byte(lineageJSON), &timeline); e != nil {
+		t.Fatalf("decode lineage json: %v\n%s", e, lineageJSON)
+	}
+	evalPass, evalFail, annotations := 0, 0, 0
+	for _, e := range timeline {
+		switch {
+		case e.Kind == "annotation":
+			annotations++
+		case e.Pass != nil && *e.Pass:
+			evalPass++
+		default:
+			evalFail++
+		}
+	}
+	if evalPass < 1 || evalFail < 1 || annotations < 1 {
+		t.Errorf("lineage missing the fail→pass arc: %d pass, %d fail, %d annotations", evalPass, evalFail, annotations)
+	}
+}
+
+// writeTriageFixture creates a fixture triage-issue skill with an evals.yaml
+// whose correctness suite distinguishes the right label from the wrong one.
+func writeTriageFixture(t *testing.T) string {
+	t.Helper()
+	dir := filepath.Join(t.TempDir(), "triage-issue")
+	if err := os.MkdirAll(dir, 0o755); err != nil {
+		t.Fatal(err)
+	}
+	skill := `---
+name: triage-issue
+description: Sorts incoming issues into ready-to-implement, duplicate, or needs-info.
+metadata:
+  author: quiver-playground
+  version: "1.0.0"
+---
+
+# Triage an issue
+
+Classify each incoming issue into exactly one bucket: ready-to-implement,
+duplicate, or needs-info. When a feature request leaves an ambiguity (e.g.
+whether to add a setting), prefer needs-info over ready-to-implement.
+`
+	evals := `version: 1
+suites:
+  - name: triage-correctness
+    cases:
+      - name: ambiguous-feature-needs-info
+        graders:
+          - type: outcome
+            expect: success
+          - type: text
+            on: final_message
+            contains: ["needs-info"]
+            reject: ["ready-to-implement"]
+`
+	mustWrite(t, filepath.Join(dir, "SKILL.md"), skill)
+	mustWrite(t, filepath.Join(dir, "evals.yaml"), evals)
+	return dir
+}
+
+func mustWrite(t *testing.T, path, content string) {
+	t.Helper()
+	if err := os.WriteFile(path, []byte(content), 0o644); err != nil {
+		t.Fatalf("write %s: %v", path, err)
+	}
+}
+
+// seedTriageSession seeds a successful triage-issue run ending with finalMsg,
+// using a single Bash tool. Thin wrapper over seedSession.
+func seedTriageSession(t *testing.T, cfg *config.Config, startedMs int64, finalMsg string) string {
+	t.Helper()
+	return seedSession(t, cfg, sessionSeed{
+		StartedMs: startedMs, Skill: "triage-issue", FinalMsg: finalMsg,
+		Outcome: "success", Tools: []string{"Bash"},
+	})
+}
+
+// sessionSeed describes a synthetic captured session to inject straight into the
+// audit store (bypassing agent-store discovery), so a test controls exactly the
+// evidence the graders read: the skill that fired, the ordered tool calls, the
+// session outcome, and the final assistant message.
+type sessionSeed struct {
+	StartedMs int64
+	Skill     string
+	FinalMsg  string
+	Outcome   string
+	Tools     []string // ordered tool names (TOOL spans), after the SKILL span
+}
+
+// seedSession writes the synthetic session and returns its id.
+func seedSession(t *testing.T, cfg *config.Config, seed sessionSeed) string {
+	t.Helper()
+	ctx := context.Background()
+	s, err := store.Open(ctx, store.OpenOptions{Path: ops.DBPath(cfg)})
+	if err != nil {
+		t.Fatalf("open store: %v", err)
+	}
+	defer s.Close()
+
+	sid := uuid.New()
+	outMsgs, _ := json.Marshal([]map[string]string{{"role": "assistant", "content": seed.FinalMsg}})
+	tid := "trace-" + sid.String()[:8]
+	end := seed.StartedMs + 1000
+
+	spans := []*store.SpanRow{
+		{SpanID: tid + "-llm", TraceID: tid, SessionID: sid, AgentName: "claude-code",
+			Kind: "LLM", Name: "chat", StartMs: seed.StartedMs, EndMs: end,
+			Attributes: fmt.Sprintf(`{"gen_ai.output.messages":%q}`, string(outMsgs))},
+		{SpanID: tid + "-skill", TraceID: tid, SessionID: sid, AgentName: "claude-code",
+			Kind: "SKILL", Name: "execute_tool Skill", StartMs: seed.StartedMs + 50, EndMs: seed.StartedMs + 100,
+			Attributes: fmt.Sprintf(`{"gen_ai.tool.name":"Skill","skill.name":%q}`, seed.Skill)},
+	}
+	for i, tool := range seed.Tools {
+		at := seed.StartedMs + int64(100*(i+2))
+		spans = append(spans, &store.SpanRow{
+			SpanID: fmt.Sprintf("%s-tool%d", tid, i), TraceID: tid, SessionID: sid, AgentName: "claude-code",
+			Kind: "TOOL", Name: "execute_tool " + tool, StartMs: at, EndMs: at + 50,
+			Attributes: fmt.Sprintf(`{"gen_ai.tool.name":%q,"qvr.outcome":"success"}`, tool),
+		})
+	}
+	meta := &store.SessionMetaRow{
+		SessionID: sid, AgentName: "claude-code", Model: "claude-opus-4-8",
+		Title: seed.Skill + " run", StartedMs: seed.StartedMs, EndedMs: end,
+		Turns: 1, Tools: int64(len(seed.Tools)), Skills: []string{seed.Skill}, Outcome: seed.Outcome,
+		DeriverVersion: 8,
+	}
+	if err := s.ReplaceSessionDerivation(ctx, meta, spans); err != nil {
+		t.Fatalf("seed session: %v", err)
+	}
+	return sid.String()
+}
diff --git a/cmd/audit.go b/cmd/audit.go
index 0478cde..830453e 100644
--- a/cmd/audit.go
+++ b/cmd/audit.go
@@ -26,9 +26,11 @@ instantly). Each session's verbatim trace lands in a local SQLite database,
 attributed to the exact locked skill version that ran. Query it with
 'qvr audit logs' / 'qvr audit sessions'.
 
-The everyday surface is enable/disable, discover, status, sessions, logs, and
-export. The remaining subcommands (ingest, raw, spans, rederive, gc) are
-low-level plumbing the maintenance paths use and are hidden from this list.`,
+The everyday surface is enable/disable, discover, status, sessions, logs,
+export, and annotate/annotations (record and read human verdicts on what a
+skill actually did — the feedback a self-improvement loop reads). The remaining
+subcommands (ingest, raw, spans, rederive, gc) are low-level plumbing the
+maintenance paths use and are hidden from this list.`,
 	// Reject a typo'd subcommand (`qvr audit enabel`) with a non-zero exit
 	// instead of silently printing help (issue #169 — the #120 fix missed this
 	// parent). No args still prints help.
diff --git a/cmd/audit_annotate.go b/cmd/audit_annotate.go
new file mode 100644
index 0000000..9386baa
--- /dev/null
+++ b/cmd/audit_annotate.go
@@ -0,0 +1,112 @@
+package cmd
+
+import (
+	"fmt"
+	"os/user"
+	"time"
+
+	"github.com/astra-sh/qvr/internal/config"
+	"github.com/astra-sh/qvr/internal/ops/store"
+	"github.com/google/uuid"
+	"github.com/spf13/cobra"
+)
+
+var (
+	annotateOutcome string
+	annotateNote    string
+	annotateSkill   string
+	annotateAuthor  string
+)
+
+var auditAnnotateCmd = &cobra.Command{
+	Use:   "annotate <session-id>",
+	Short: "Record a human verdict on a captured session",
+	Long: `Attaches a durable human verdict — the outer-loop feedback signal — to a
+captured session: did the skill do the right thing, and why?
+
+Unlike the auto-derived span outcome (which only reflects whether a tool
+errored), an annotation is a reviewer's judgement ("this triage was wrong, the
+issue was ambiguous") and is never overwritten by 'qvr audit rederive' — it
+lives outside the regenerable spans/session_meta projection. A self-improvement
+loop reads these verdicts (alongside the auto outcome) to decide what to fix.
+
+Scope a verdict to one skill with --skill, or omit it for a whole-session
+verdict. Annotations are append-only: a session can accrue several over time.`,
+	Args: cobra.ExactArgs(1),
+	RunE: runAuditAnnotate,
+}
+
+func init() {
+	auditAnnotateCmd.Flags().StringVar(&annotateOutcome, "outcome", "", "the verdict (e.g. good, bad, blocked) — required")
+	auditAnnotateCmd.Flags().StringVar(&annotateNote, "note", "", "why — the reason for the verdict")
+	auditAnnotateCmd.Flags().StringVar(&annotateSkill, "skill", "", "scope the verdict to one skill (omit for whole-session)")
+	auditAnnotateCmd.Flags().StringVar(&annotateAuthor, "author", "", "verdict author (defaults to the current OS user)")
+	auditCmd.AddCommand(auditAnnotateCmd)
+}
+
+func runAuditAnnotate(cmd *cobra.Command, args []string) error {
+	if annotateOutcome == "" {
+		return fmt.Errorf("--outcome is required (e.g. --outcome bad)")
+	}
+	id, err := uuid.Parse(args[0])
+	if err != nil {
+		return fmt.Errorf("invalid session id %q: %w", args[0], err)
+	}
+
+	cfg, err := config.Load()
+	if err != nil {
+		return err
+	}
+	if !auditDBExists(cfg) {
+		return fmt.Errorf("no audit database yet — run `qvr audit discover` first")
+	}
+
+	// A verdict is a write, so open read-write (this also applies migration
+	// 0008 if the DB predates the annotations table).
+	s, err := openAuditStore(cmd.Context(), cfg, false)
+	if err != nil {
+		return fmt.Errorf("open audit store: %w", err)
+	}
+	defer s.Close()
+
+	meta, err := s.GetSessionMeta(cmd.Context(), id)
+	if err != nil {
+		return fmt.Errorf("look up session: %w", err)
+	}
+	if meta == nil {
+		return fmt.Errorf("no captured session %s — run `qvr audit sessions` to list ids", id)
+	}
+
+	a := &store.AnnotationRow{
+		SessionID: id,
+		Skill:     annotateSkill,
+		Outcome:   annotateOutcome,
+		Note:      annotateNote,
+		Author:    annotateAuthor,
+		CreatedAt: time.Now().UTC(),
+	}
+	if a.Author == "" {
+		a.Author = currentUserName()
+	}
+	if err := s.PutAnnotation(cmd.Context(), a); err != nil {
+		return err
+	}
+
+	if outputFormat == "json" {
+		return printer.JSON(a)
+	}
+	scope := "session"
+	if a.Skill != "" {
+		scope = "skill " + a.Skill
+	}
+	printer.Info(fmt.Sprintf("Recorded %q verdict on %s (%s)", a.Outcome, scope, id))
+	return nil
+}
+
+// currentUserName returns the OS username for verdict provenance, or "unknown".
+func currentUserName() string {
+	if u, err := user.Current(); err == nil && u.Username != "" {
+		return u.Username
+	}
+	return "unknown"
+}
diff --git a/cmd/audit_annotate_test.go b/cmd/audit_annotate_test.go
new file mode 100644
index 0000000..a366955
--- /dev/null
+++ b/cmd/audit_annotate_test.go
@@ -0,0 +1,84 @@
+package cmd
+
+import (
+	"encoding/json"
+	"strings"
+	"testing"
+)
+
+// TestAudit_AnnotateRoundTrip drives the full CLI: discover a seeded session,
+// record a human verdict on it, then read it back via `audit annotations`.
+func TestAudit_AnnotateRoundTrip(t *testing.T) {
+	home, _ := isolatedHome(t, true)
+	t.Setenv("HOME", home)
+	seedClaudeStore(t, home)
+
+	if _, stderr, err := runRoot(t, nil, "audit", "discover", "--agent", "claude", "--output", "json"); err != nil {
+		t.Fatalf("discover: err=%v stderr=%q", err, stderr)
+	}
+
+	// Grab the discovered session id.
+	sessionsOut, _, err := runRoot(t, nil, "audit", "sessions", "--output", "json")
+	if err != nil {
+		t.Fatalf("sessions: %v", err)
+	}
+	var sessions []struct {
+		SessionID string `json:"session_id"`
+	}
+	if e := json.Unmarshal([]byte(sessionsOut), &sessions); e != nil || len(sessions) == 0 {
+		t.Fatalf("decode sessions: %v\n%s", e, sessionsOut)
+	}
+	sid := sessions[0].SessionID
+
+	// Record a per-skill verdict.
+	out, stderr, err := runRoot(t, nil, "audit", "annotate", sid,
+		"--skill", "code-review", "--outcome", "bad", "--note", "missed the bug", "--output", "json")
+	if err != nil {
+		t.Fatalf("annotate: err=%v stderr=%q", err, stderr)
+	}
+	var wrote struct {
+		Outcome string `json:"outcome"`
+		Skill   string `json:"skill"`
+		Author  string `json:"author"`
+	}
+	if e := json.Unmarshal([]byte(out), &wrote); e != nil {
+		t.Fatalf("decode annotate json: %v\n%s", e, out)
+	}
+	if wrote.Outcome != "bad" || wrote.Skill != "code-review" || wrote.Author == "" {
+		t.Fatalf("annotate echo = %+v, want outcome=bad skill=code-review author!=\"\"", wrote)
+	}
+
+	// Read it back.
+	listOut, _, err := runRoot(t, nil, "audit", "annotations", "--skill", "code-review", "--output", "json")
+	if err != nil {
+		t.Fatalf("annotations list: %v", err)
+	}
+	if !strings.Contains(listOut, "missed the bug") || !strings.Contains(listOut, sid) {
+		t.Errorf("annotation not listed: %s", listOut)
+	}
+}
+
+// TestAudit_AnnotateRequiresOutcome pins the required --outcome flag.
+func TestAudit_AnnotateRequiresOutcome(t *testing.T) {
+	home, _ := isolatedHome(t, true)
+	t.Setenv("HOME", home)
+	if _, _, err := runRoot(t, nil, "audit", "annotate", "00000000-0000-0000-0000-000000000000"); err == nil {
+		t.Error("expected an error when --outcome is omitted")
+	}
+}
+
+// TestAudit_AnnotateUnknownSession rejects a verdict on a session that was
+// never captured.
+func TestAudit_AnnotateUnknownSession(t *testing.T) {
+	home, _ := isolatedHome(t, true)
+	t.Setenv("HOME", home)
+	seedClaudeStore(t, home)
+	if _, _, err := runRoot(t, nil, "audit", "discover", "--agent", "claude"); err != nil {
+		t.Fatalf("discover: %v", err)
+	}
+	_, _, err := runRoot(t, nil, "audit", "annotate",
+		"11111111-1111-1111-1111-111111111111", "--outcome", "bad")
+	if err == nil {
+		t.Error("expected an error annotating an unknown session")
+	}
+}
diff --git a/cmd/audit_annotations.go b/cmd/audit_annotations.go
new file mode 100644
index 0000000..e06a2ab
--- /dev/null
+++ b/cmd/audit_annotations.go
@@ -0,0 +1,106 @@
+package cmd
+
+import (
+	"fmt"
+
+	"github.com/astra-sh/qvr/internal/config"
+	"github.com/astra-sh/qvr/internal/ops/store"
+	"github.com/google/uuid"
+	"github.com/spf13/cobra"
+)
+
+var (
+	annotationsSession string
+	annotationsSkill   string
+	annotationsSince   string
+)
+
+var auditAnnotationsCmd = &cobra.Command{
+	Use:   "annotations",
+	Short: "List human verdicts recorded on captured sessions",
+	Long: `Lists the human verdicts recorded with 'qvr audit annotate', newest-first.
+These are the outer-loop feedback signal a self-improvement loop reads to decide
+what to fix. Scope with --session and/or --skill.`,
+	Args: cobra.NoArgs,
+	RunE: runAuditAnnotations,
+}
+
+func init() {
+	auditAnnotationsCmd.Flags().StringVar(&annotationsSession, "session", "", "filter to one session id")
+	auditAnnotationsCmd.Flags().StringVar(&annotationsSkill, "skill", "", "filter to one skill")
+	auditAnnotationsCmd.Flags().StringVar(&annotationsSince, "since", "", "only verdicts since this time (e.g. 7d, 24h, or RFC3339)")
+	auditCmd.AddCommand(auditAnnotationsCmd)
+}
+
+func runAuditAnnotations(cmd *cobra.Command, args []string) error {
+	cfg, err := config.Load()
+	if err != nil {
+		return err
+	}
+	if !auditDBExists(cfg) {
+		if outputFormat == "json" {
+			return printer.JSON([]any{})
+		}
+		printer.Info("No annotations recorded yet")
+		return nil
+	}
+
+	f := &store.AnnotationFilter{Skill: annotationsSkill}
+	if annotationsSession != "" {
+		id, perr := uuid.Parse(annotationsSession)
+		if perr != nil {
+			return fmt.Errorf("invalid --session id %q: %w", annotationsSession, perr)
+		}
+		f.SessionID = &id
+	}
+	if annotationsSince != "" {
+		t, perr := parseTimeFlag(annotationsSince)
+		if perr != nil {
+			return fmt.Errorf("invalid --since: %w", perr)
+		}
+		f.Since = &t
+	}
+
+	s, err := openAuditStore(cmd.Context(), cfg, true)
+	if err != nil {
+		return fmt.Errorf("open audit store: %w", err)
+	}
+	defer s.Close()
+
+	rows, err := s.ListAnnotations(cmd.Context(), f)
+	if err != nil {
+		return fmt.Errorf("list annotations: %w", err)
+	}
+	return renderAnnotations(rows)
+}
+
+func renderAnnotations(rows []*store.AnnotationRow) error {
+	if outputFormat == "json" {
+		if len(rows) == 0 {
+			return printer.JSON([]any{})
+		}
+		return printer.JSON(rows)
+	}
+	if len(rows) == 0 {
+		printer.Info("No annotations recorded yet")
+		return nil
+	}
+	headers := []string{"CREATED", "OUTCOME", "SKILL", "AUTHOR", "NOTE", "SESSION ID"}
+	out := make([][]string, 0, len(rows))
+	for _, a := range rows {
+		skill := a.Skill
+		if skill == "" {
+			skill = "(session)"
+		}
+		out = append(out, []string{
+			a.CreatedAt.Local().Format("01-02 15:04"),
+			a.Outcome,
+			skill,
+			a.Author,
+			clipCell(a.Note, 48),
+			a.SessionID.String(),
+		})
+	}
+	printer.Table(headers, out)
+	return nil
+}
diff --git a/cmd/audit_test.go b/cmd/audit_test.go
index b5a4ed2..e990041 100644
--- a/cmd/audit_test.go
+++ b/cmd/audit_test.go
@@ -99,6 +99,12 @@ func runRoot(t *testing.T, stdin []byte, args ...string) (string, string, error)
 		ingestAgent, ingestSession, ingestCwd = "", "", ""
 		discoverAgents, discoverSince = nil, ""
 		discoverKeepAll, discoverDryRun = false, false
+		// SkillOps + annotation command flags (same persistence hazard).
+		evalSkillDir, evalSuite, evalSession = "", "", ""
+		annotateOutcome, annotateNote, annotateSkill, annotateAuthor = "", "", "", ""
+		annotationsSession, annotationsSkill, annotationsSince = "", "", ""
+		promoteReason, promoteSkillDir, lineageSince = "", "", ""
+		promoteForce = false
 	})
 	return stdout, stderr, err
 }
diff --git a/cmd/ops.go b/cmd/ops.go
new file mode 100644
index 0000000..49784c4
--- /dev/null
+++ b/cmd/ops.go
@@ -0,0 +1,126 @@
+package cmd
+
+import (
+	"fmt"
+	"os"
+	"path/filepath"
+	"sort"
+
+	"github.com/astra-sh/qvr/internal/config"
+	"github.com/astra-sh/qvr/internal/eval"
+	"github.com/astra-sh/qvr/internal/model"
+	"github.com/spf13/cobra"
+)
+
+// opsCmd is the parent for SkillOps — evaluating skills as first-class
+// artifacts and gating their evolution on evidence. It builds on the audit
+// trace layer: `qvr ops eval` grades a skill's captured sessions against its
+// evals.yaml, and `qvr ops lineage` joins those verdicts to the spans by the
+// exact locked commit that ran.
+var opsCmd = &cobra.Command{
+	Use:   "ops",
+	Short: "[experimental] Evaluate skills and gate their evolution on evidence",
+	Long: `[EXPERIMENTAL] SkillOps treats a skill as a first-class artifact with an
+evals.yaml manifest (a sibling of SKILL.md). It grades the skill's CAPTURED
+sessions deterministically — pure Go over the spans qvr already derived, no
+model calls, no agent execution — and keys every verdict to the exact locked
+commit that ran. That is the evidence a self-improvement loop gates on: improve
+the skill, re-run it, and 'qvr ops eval' must show the score improve before the
+change ships.
+
+Subcommands: eval (grade a skill), lineage (verdicts over commits), promote
+(refuse to advance a skill without a passing eval).`,
+	RunE: rejectUnknownSubcommand,
+}
+
+func init() {
+	rootCmd.AddCommand(opsCmd)
+}
+
+// resolvedSkill carries everything the ops subcommands need about a target
+// skill: where its evals.yaml/SKILL.md live, and the locked commit that pins
+// every eval verdict (the lock × evidence join). Commit is "" for a skill with
+// no project lock entry (e.g. a local fixture not yet committed).
+type resolvedSkill struct {
+	Name   string
+	Dir    string
+	Commit string
+}
+
+// resolveSkill locates a skill's directory and locked commit. An explicit
+// --skill-dir wins (the evolve loop points it at the ejected, writable copy);
+// otherwise the installed project target dir is searched. The commit is read
+// from the project lock when present.
+func resolveSkill(name, skillDirFlag string) (*resolvedSkill, error) {
+	projectRoot, err := os.Getwd()
+	if err != nil {
+		return nil, fmt.Errorf("resolve cwd: %w", err)
+	}
+	out := &resolvedSkill{Name: name}
+
+	if skillDirFlag != "" {
+		out.Dir = skillDirFlag
+	} else {
+		dir, ok := installedSkillDir(projectRoot, name)
+		if !ok {
+			return nil, fmt.Errorf("could not find skill %q under any target dir — pass --skill-dir <path>", name)
+		}
+		out.Dir = dir
+	}
+	if _, err := os.Stat(filepath.Join(out.Dir, "SKILL.md")); err != nil {
+		return nil, fmt.Errorf("%s has no SKILL.md: %w", out.Dir, err)
+	}
+	out.Commit = lockedCommit(projectRoot, name)
+	return out, nil
+}
+
+// installedSkillDir searches the distinct project-relative target dirs for an
+// installed skill named name, returning the first that holds a SKILL.md. The
+// dirs are searched in sorted order so resolution is deterministic.
+func installedSkillDir(projectRoot, name string) (string, bool) {
+	seen := map[string]bool{}
+	var dirs []string
+	for _, t := range model.Targets {
+		if !seen[t.LocalDir] {
+			seen[t.LocalDir] = true
+			dirs = append(dirs, t.LocalDir)
+		}
+	}
+	sort.Strings(dirs)
+	for _, d := range dirs {
+		cand := filepath.Join(projectRoot, d, name)
+		if _, err := os.Stat(filepath.Join(cand, "SKILL.md")); err == nil {
+			return cand, true
+		}
+	}
+	return "", false
+}
+
+// lockedCommit reads the project lock for the skill's pinned commit, or "" when
+// there is no lock/entry (best-effort: a missing commit just leaves the eval
+// run keyed by an empty commit, still time-ordered for lineage).
+func lockedCommit(projectRoot, name string) string {
+	lockPath := model.DefaultLockPath(projectRoot, config.Dir(), false)
+	lock, err := model.ReadLockFile(lockPath)
+	if err != nil {
+		return ""
+	}
+	if e, ok := lock.Skills[name]; ok {
+		return e.Commit
+	}
+	return ""
+}
+
+// loadEvalSuiteNames is a small helper for diagnostics: the suite names a
+// skill's manifest defines, or nil.
+func loadEvalSuiteNames(dir string) []string {
+	man, err := eval.Load(dir)
+	if err != nil || man == nil {
+		return nil
+	}
+	names := make([]string, 0, len(man.Suites))
+	for _, s := range man.Suites {
+		names = append(names, s.Name)
+	}
+	return names
+}
diff --git a/cmd/ops_eval.go b/cmd/ops_eval.go
new file mode 100644
index 0000000..1d1c04b
--- /dev/null
+++ b/cmd/ops_eval.go
@@ -0,0 +1,174 @@
+package cmd
+
+import (
+	"fmt"
+	"strings"
+
+	"github.com/astra-sh/qvr/internal/config"
+	"github.com/astra-sh/qvr/internal/eval"
+	"github.com/astra-sh/qvr/internal/ops/store"
+	"github.com/google/uuid"
+	"github.com/spf13/cobra"
+)
+
+var (
+	evalSkillDir string
+	evalSuite    string
+	evalSession  string
+)
+
+var opsEvalCmd = &cobra.Command{
+	Use:   "eval",
+	Short: "Grade a skill's captured sessions against its evals.yaml",
+	Long: `Grades a skill against its evals.yaml manifest, deterministically, over the
+sessions qvr already captured — no agent run, no model call. The verdict is
+recorded keyed by the skill's locked commit, so 'qvr ops lineage' can show the
+score move across versions.`,
+	RunE: rejectUnknownSubcommand,
+}
+
+var opsEvalRunCmd = &cobra.Command{
+	Use:   "run <skill>",
+	Short: "Run a skill's evals against its latest captured session",
+	Long: `Grades the skill's most recent captured session (or --session <id>) against
+the suite(s) in its evals.yaml, records the result keyed by the locked commit,
+and exits non-zero if any case fails — so a CI step can gate a change.
+
+By default the installed project skill dir is used; pass --skill-dir to evaluate
+a specific directory (e.g. the writable copy an evolution loop is editing).`,
+	Args: cobra.ExactArgs(1),
+	RunE: runOpsEval,
+}
+
+func init() {
+	opsEvalRunCmd.Flags().StringVar(&evalSkillDir, "skill-dir", "", "skill directory holding evals.yaml (default: the installed project skill)")
+	opsEvalRunCmd.Flags().StringVar(&evalSuite, "suite", "", "evaluate only this suite (default: all)")
+	opsEvalRunCmd.Flags().StringVar(&evalSession, "session", "", "grade this session id (default: the skill's most recent)")
+	opsEvalCmd.AddCommand(opsEvalRunCmd)
+	opsCmd.AddCommand(opsEvalCmd)
+}
+
+func runOpsEval(cmd *cobra.Command, args []string) error {
+	rs, err := resolveSkill(args[0], evalSkillDir)
+	if err != nil {
+		return err
+	}
+
+	cfg, err := config.Load()
+	if err != nil {
+		return err
+	}
+	if !auditDBExists(cfg) {
+		return fmt.Errorf("no audit database yet — run `qvr audit discover` first")
+	}
+	// Eval writes an eval_runs row, so open read-write (applies migration 0009).
+	s, err := openAuditStore(cmd.Context(), cfg, false)
+	if err != nil {
+		return fmt.Errorf("open audit store: %w", err)
+	}
+	defer s.Close()
+
+	in := eval.RunInput{SkillName: rs.Name, SkillDir: rs.Dir, Suite: evalSuite}
+	if evalSession != "" {
+		id, perr := uuid.Parse(evalSession)
+		if perr != nil {
+			return fmt.Errorf("invalid --session id %q: %w", evalSession, perr)
+		}
+		in.SessionID = &id
+	}
+
+	result, err := eval.Run(cmd.Context(), s, in)
+	if err != nil {
+		if names := loadEvalSuiteNames(rs.Dir); evalSuite != "" && len(names) > 0 {
+			return fmt.Errorf("%w (available suites: %s)", err, strings.Join(names, ", "))
+		}
+		return err
+	}
+
+	if _, err := s.PutEvalRun(cmd.Context(), evalRunRow(rs, result)); err != nil {
+		return fmt.Errorf("record eval run: %w", err)
+	}
+
+	if rerr := renderEvalResult(result); rerr != nil {
+		return rerr
+	}
+	// A failed gate exits non-zero so CI / the evolution loop can branch on it.
+	if !result.Pass {
+		return fmt.Errorf("eval failed: %d of %d cases failed for %s", result.Failed, result.Passed+result.Failed, rs.Name)
+	}
+	return nil
+}
+
+// evalRunRow flattens a run result into the persisted row + its case rows.
+func evalRunRow(rs *resolvedSkill, r *eval.RunResult) *store.EvalRunRow {
+	suite := evalSuite
+	if suite == "" {
+		suite = "*"
+	}
+	row := &store.EvalRunRow{
+		SkillName:   rs.Name,
+		SkillCommit: rs.Commit,
+		Suite:       suite,
+		SessionID:   r.SessionID,
+		Passed:      r.Passed,
+		Failed:      r.Failed,
+		Pass:        r.Pass,
+	}
+	for _, su := range r.Suites {
+		for _, c := range su.Cases {
+			row.Cases = append(row.Cases, store.EvalCaseRow{
+				Suite: su.Suite, Case: c.Case, Pass: c.Pass, Detail: caseDetail(c),
+			})
+		}
+	}
+	return row
+}
+
+// caseDetail joins the failing graders' details into a one-line reason.
+func caseDetail(c eval.CaseResult) string {
+	if c.Pass {
+		return ""
+	}
+	var parts []string
+	for _, g := range c.Graders {
+		if !g.Pass {
+			parts = append(parts, fmt.Sprintf("%s: %s", g.Type, g.Detail))
+		}
+	}
+	return strings.Join(parts, "; ")
+}
+
+func renderEvalResult(r *eval.RunResult) error {
+	if outputFormat == "json" {
+		return printer.JSON(r)
+	}
+	verdict := "PASS"
+	if !r.Pass {
+		verdict = "FAIL"
+	}
+	printer.Info(fmt.Sprintf("%s — %s: %d passed, %d failed (session %s)",
+		verdict, r.Skill, r.Passed, r.Failed, shortID(r.SessionID)))
+	headers := []string{"SUITE", "CASE", "RESULT", "DETAIL"}
+	rows := make([][]string, 0)
+	for _, su := range r.Suites {
+		for _, c := range su.Cases {
+			res := "pass"
+			if !c.Pass {
+				res = "FAIL"
+			}
+			rows = append(rows, []string{su.Suite, c.Case, res, clipCell(caseDetail(c), 60)})
+		}
+	}
+	if len(rows) > 0 {
+		printer.Table(headers, rows)
+	}
+	return nil
+}
+
+// shortID clips a session id to its first segment for compact display.
+func shortID(id string) string {
+	if len(id) >= 8 {
+		return id[:8]
+	}
+	return id
+}
diff --git a/cmd/ops_lineage.go b/cmd/ops_lineage.go
new file mode 100644
index 0000000..f99d1b1
--- /dev/null
+++ b/cmd/ops_lineage.go
@@ -0,0 +1,148 @@
+package cmd
+
+import (
+	"fmt"
+	"sort"
+	"time"
+
+	"github.com/astra-sh/qvr/internal/config"
+	"github.com/astra-sh/qvr/internal/ops/store"
+	"github.com/spf13/cobra"
+)
+
+var lineageSince string
+
+var opsLineageCmd = &cobra.Command{
+	Use:   "lineage <skill>",
+	Short: "Show a skill's eval verdicts and human annotations over time",
+	Long: `Merges a skill's eval runs and human annotations into one time-ordered
+timeline, each eval keyed to the locked commit that was graded. This is how the
+self-improvement loop reads "at this commit the suite failed; after the fix it
+passed" — the lock × evidence join made legible.`,
+	Args: cobra.ExactArgs(1),
+	RunE: runOpsLineage,
+}
+
+func init() {
+	opsLineageCmd.Flags().StringVar(&lineageSince, "since", "", "only entries since this time (e.g. 30d, 24h, or RFC3339)")
+	opsCmd.AddCommand(opsLineageCmd)
+}
+
+// lineageEntry is one row of the merged timeline.
+type lineageEntry struct {
+	At      time.Time `json:"at"`
+	Kind    string    `json:"kind"` // "eval" or "annotation"
+	Commit  string    `json:"commit,omitempty"`
+	Detail  string    `json:"detail"`
+	Pass    *bool     `json:"pass,omitempty"`
+	Outcome string    `json:"outcome,omitempty"`
+}
+
+func runOpsLineage(cmd *cobra.Command, args []string) error {
+	name := args[0]
+	cfg, err := config.Load()
+	if err != nil {
+		return err
+	}
+	if !auditDBExists(cfg) {
+		if outputFormat == "json" {
+			return printer.JSON([]any{})
+		}
+		printer.Info("Nothing recorded yet")
+		return nil
+	}
+	s, err := openAuditStore(cmd.Context(), cfg, true)
+	if err != nil {
+		return fmt.Errorf("open audit store: %w", err)
+	}
+	defer s.Close()
+
+	var since *time.Time
+	if lineageSince != "" {
+		t, perr := parseTimeFlag(lineageSince)
+		if perr != nil {
+			return fmt.Errorf("invalid --since: %w", perr)
+		}
+		since = &t
+	}
+
+	runs, err := s.ListEvalRuns(cmd.Context(), &store.EvalRunFilter{SkillName: name, Since: since})
+	if err != nil {
+		return fmt.Errorf("list eval runs: %w", err)
+	}
+	anns, err := s.ListAnnotations(cmd.Context(), &store.AnnotationFilter{Skill: name, Since: since})
+	if err != nil {
+		return fmt.Errorf("list annotations: %w", err)
+	}
+
+	timeline := mergeLineage(runs, anns)
+	return renderLineage(timeline)
+}
+
+// mergeLineage interleaves eval runs and annotations into one newest-first
+// timeline.
+func mergeLineage(runs []*store.EvalRunRow, anns []*store.AnnotationRow) []lineageEntry {
+	var out []lineageEntry
+	for _, r := range runs {
+		pass := r.Pass
+		out = append(out, lineageEntry{
+			At: r.StartedAt, Kind: "eval", Commit: shortCommit(r.SkillCommit), Pass: &pass,
+			Detail: fmt.Sprintf("suite %s: %d passed, %d failed", r.Suite, r.Passed, r.Failed),
+		})
+	}
+	for _, a := range anns {
+		out = append(out, lineageEntry{
+			At: a.CreatedAt, Kind: "annotation", Outcome: a.Outcome,
+			Detail: annotationDetail(a),
+		})
+	}
+	sort.SliceStable(out, func(i, j int) bool { return out[i].At.After(out[j].At) })
+	return out
+}
+
+func annotationDetail(a *store.AnnotationRow) string {
+	if a.Note != "" {
+		return fmt.Sprintf("%s — %s", a.Outcome, a.Note)
+	}
+	return a.Outcome
+}
+
+func renderLineage(timeline []lineageEntry) error {
+	if outputFormat == "json" {
+		if len(timeline) == 0 {
+			return printer.JSON([]any{})
+		}
+		return printer.JSON(timeline)
+	}
+	if len(timeline) == 0 {
+		printer.Info("No eval runs or annotations for that skill yet")
+		return nil
+	}
+	headers := []string{"WHEN", "KIND", "COMMIT", "RESULT", "DETAIL"}
+	rows := make([][]string, 0, len(timeline))
+	for _, e := range timeline {
+		result := ""
+		switch {
+		case e.Pass != nil && *e.Pass:
+			result = "pass"
+		case e.Pass != nil:
+			result = "FAIL"
+		case e.Outcome != "":
+			result = e.Outcome
+		}
+		rows = append(rows, []string{
+			e.At.Local().Format("01-02 15:04"),
+			e.Kind, e.Commit, result, clipCell(e.Detail, 56),
+		})
+	}
+	printer.Table(headers, rows)
+	return nil
+}
+
+// shortCommit clips a git SHA to 7 chars, leaving "" (no lock) untouched.
+func shortCommit(c string) string {
+	if len(c) >= 7 {
+		return c[:7]
+	}
+	return c
+}
diff --git a/cmd/ops_promote.go b/cmd/ops_promote.go
new file mode 100644
index 0000000..97eca56
--- /dev/null
+++ b/cmd/ops_promote.go
@@ -0,0 +1,117 @@
+package cmd
+
+import (
+	"fmt"
+
+	"github.com/astra-sh/qvr/internal/config"
+	"github.com/astra-sh/qvr/internal/ops/store"
+	"github.com/spf13/cobra"
+)
+
+var (
+	promoteReason   string
+	promoteForce    bool
+	promoteSkillDir string
+)
+
+var opsPromoteCmd = &cobra.Command{
+	Use:   "promote <skill>",
+	Short: "Gate a skill's advancement on a passing eval",
+	Long: `Refuses to advance a skill whose currently-locked commit has no passing eval
+run, unless --force-no-eval is given. This is the evidence gate the
+self-improvement loop ends on: a drafted improvement only ships once
+'qvr ops eval run' has recorded a pass for that exact commit.
+
+It is a check, not a state change: it reports whether the locked commit is
+backed by evidence, so a loop or CI step can branch on the exit code.`,
+	Args: cobra.ExactArgs(1),
+	RunE: runOpsPromote,
+}
+
+func init() {
+	opsPromoteCmd.Flags().StringVar(&promoteReason, "reason", "", "why the skill is being promoted (recorded in the message)")
+	opsPromoteCmd.Flags().BoolVar(&promoteForce, "force-no-eval", false, "promote even without a passing eval (records the override)")
+	opsPromoteCmd.Flags().StringVar(&promoteSkillDir, "skill-dir", "", "skill directory (default: the installed project skill)")
+	opsCmd.AddCommand(opsPromoteCmd)
+}
+
+func runOpsPromote(cmd *cobra.Command, args []string) error {
+	rs, err := resolveSkill(args[0], promoteSkillDir)
+	if err != nil {
+		return err
+	}
+
+	cfg, err := config.Load()
+	if err != nil {
+		return err
+	}
+	if !auditDBExists(cfg) {
+		return fmt.Errorf("no audit database yet — run `qvr audit discover` first")
+	}
+	s, err := openAuditStore(cmd.Context(), cfg, true)
+	if err != nil {
+		return fmt.Errorf("open audit store: %w", err)
+	}
+	defer s.Close()
+
+	passing := latestPassingEval(cmd, s, rs)
+	decision := promoteDecision(rs, passing)
+
+	if outputFormat == "json" {
+		if err := printer.JSON(decision); err != nil {
+			return err
+		}
+	} else {
+		printer.Info(decision.Message)
+	}
+	if !decision.Promoted {
+		return fmt.Errorf("refusing to promote %s: no passing eval for commit %s (run `qvr ops eval run %s`, or pass --force-no-eval)",
+			rs.Name, shortCommit(rs.Commit), rs.Name)
+	}
+	return nil
+}
+
+// promoteDecisionResult is the JSON/text shape of a promote check.
+type promoteDecisionResult struct {
+	Skill    string `json:"skill"`
+	Commit   string `json:"commit,omitempty"`
+	Promoted bool   `json:"promoted"`
+	Forced   bool   `json:"forced,omitempty"`
+	Reason   string `json:"reason,omitempty"`
+	Message  string `json:"message"`
+}
+
+func promoteDecision(rs *resolvedSkill, passing *store.EvalRunRow) promoteDecisionResult {
+	d := promoteDecisionResult{Skill: rs.Name, Commit: shortCommit(rs.Commit), Reason: promoteReason}
+	switch {
+	case passing != nil:
+		d.Promoted = true
+		d.Message = fmt.Sprintf("%s @ %s is backed by a passing eval (run #%d) — clear to promote", rs.Name, shortCommit(rs.Commit), passing.ID)
+	case promoteForce:
+		d.Promoted = true
+		d.Forced = true
+		d.Message = fmt.Sprintf("%s @ %s promoted WITHOUT a passing eval (--force-no-eval)", rs.Name, shortCommit(rs.Commit))
+	default:
+		d.Message = fmt.Sprintf("%s @ %s has no passing eval", rs.Name, shortCommit(rs.Commit))
+	}
+	return d
+}
+
+// latestPassingEval returns the newest passing eval run for the skill's locked
+// commit, or nil. A skill with no locked commit ("") can't be evidence-gated by
+// commit, so this returns nil (promotion then requires --force-no-eval).
+func latestPassingEval(cmd *cobra.Command, s store.Store, rs *resolvedSkill) *store.EvalRunRow {
+	if rs.Commit == "" {
+		return nil
+	}
+	runs, err := s.ListEvalRuns(cmd.Context(), &store.EvalRunFilter{SkillName: rs.Name, SkillCommit: rs.Commit})
+	if err != nil {
+		return nil
+	}
+	for _, r := range runs { // newest-first
+		if r.Pass {
+			return r
+		}
+	}
+	return nil
+}
diff --git a/cmd/ops_test.go b/cmd/ops_test.go
new file mode 100644
index 0000000..fb37894
--- /dev/null
+++ b/cmd/ops_test.go
@@ -0,0 +1,126 @@
+package cmd
+
+import (
+	"encoding/json"
+	"strings"
+	"testing"
+
+	"github.com/astra-sh/qvr/internal/config"
+	"github.com/astra-sh/qvr/internal/ops/store"
+)
+
+// TestOpsEval_NoDatabase pins the helpful error when nothing has been captured.
+func TestOpsEval_NoDatabase(t *testing.T) {
+	isolatedHome(t, true)
+	dir := writeGuardTestsFixture(t)
+	_, _, err := runRoot(t, nil, "ops", "eval", "run", "guard-tests", "--skill-dir", dir)
+	if err == nil || !strings.Contains(err.Error(), "audit database") {
+		t.Errorf("want a 'no audit database' error, got %v", err)
+	}
+}
+
+// TestOpsEval_UnknownSkillDir rejects a --skill-dir with no SKILL.md.
+func TestOpsEval_UnknownSkillDir(t *testing.T) {
+	isolatedHome(t, true)
+	_, _, err := runRoot(t, nil, "ops", "eval", "run", "ghost", "--skill-dir", t.TempDir())
+	if err == nil || !strings.Contains(err.Error(), "SKILL.md") {
+		t.Errorf("want a missing-SKILL.md error, got %v", err)
+	}
+}
+
+// TestOpsEval_UnknownSuite reports the available suites.
+func TestOpsEval_UnknownSuite(t *testing.T) {
+	isolatedHome(t, true)
+	cfg, _ := config.Load()
+	dir := writeGuardTestsFixture(t)
+	seedSession(t, cfg, sessionSeed{StartedMs: 1000, Skill: "guard-tests", Outcome: "success", Tools: []string{"Bash"}})
+
+	_, _, err := runRoot(t, nil, "ops", "eval", "run", "guard-tests", "--skill-dir", dir, "--suite", "nope")
+	if err == nil || !strings.Contains(err.Error(), "verifies-changes") {
+		t.Errorf("want an error listing available suites, got %v", err)
+	}
+}
+
+// TestOpsEval_NoSessions errors helpfully when the skill has no captured run.
+func TestOpsEval_NoSessions(t *testing.T) {
+	isolatedHome(t, true)
+	cfg, _ := config.Load()
+	dir := writeGuardTestsFixture(t)
+	// Create the DB (a session for a DIFFERENT skill) so the DB exists but the
+	// target skill has no runs.
+	seedSession(t, cfg, sessionSeed{StartedMs: 1000, Skill: "other-skill", Outcome: "success", Tools: []string{"Bash"}})
+
+	_, _, err := runRoot(t, nil, "ops", "eval", "run", "guard-tests", "--skill-dir", dir)
+	if err == nil || !strings.Contains(err.Error(), "no captured sessions") {
+		t.Errorf("want a 'no captured sessions' error, got %v", err)
+	}
+}
+
+// TestOpsPromote_Gate proves the evidence gate: without a passing eval the
+// promote refuses (non-zero), and --force-no-eval overrides it.
+func TestOpsPromote_Gate(t *testing.T) {
+	isolatedHome(t, true)
+	cfg, _ := config.Load()
+	dir := writeGuardTestsFixture(t)
+	seedSession(t, cfg, sessionSeed{StartedMs: 1000, Skill: "guard-tests", Outcome: "success", Tools: []string{"Bash"}})
+
+	// No eval recorded yet → refuse.
+	if _, _, err := runRoot(t, nil, "ops", "promote", "guard-tests", "--skill-dir", dir); err == nil {
+		t.Error("expected promote to refuse without a passing eval")
+	}
+
+	// Forced → allowed, and the JSON records the override.
+	out, _, err := runRoot(t, nil, "ops", "promote", "guard-tests", "--skill-dir", dir, "--force-no-eval", "--output", "json")
+	if err != nil {
+		t.Fatalf("forced promote: %v", err)
+	}
+	var d struct {
+		Promoted bool `json:"promoted"`
+		Forced   bool `json:"forced"`
+	}
+	if e := json.Unmarshal([]byte(out), &d); e != nil {
+		t.Fatalf("decode: %v\n%s", e, out)
+	}
+	if !d.Promoted || !d.Forced {
+		t.Errorf("forced promote = %+v, want promoted+forced", d)
+	}
+}
+
+// TestPromoteDecision unit-tests the pure gate logic across its three branches.
+func TestPromoteDecision(t *testing.T) {
+	rs := &resolvedSkill{Name: "guard-tests", Commit: "abc1234"}
+
+	t.Run("passing eval clears it", func(t *testing.T) {
+		promoteForce = false
+		d := promoteDecision(rs, &store.EvalRunRow{ID: 7, Pass: true})
+		if !d.Promoted || d.Forced {
+			t.Errorf("want promoted, not forced: %+v", d)
+		}
+	})
+	t.Run("no eval refuses", func(t *testing.T) {
+		promoteForce = false
+		if d := promoteDecision(rs, nil); d.Promoted {
+			t.Errorf("want refusal, got %+v", d)
+		}
+	})
+	t.Run("force overrides", func(t *testing.T) {
+		promoteForce = true
+		d := promoteDecision(rs, nil)
+		if !d.Promoted || !d.Forced {
+			t.Errorf("want forced promote: %+v", d)
+		}
+		promoteForce = false
+	})
+}
+
+// TestOpsLineage_Empty pins the no-data path: empty JSON, exit 0.
+func TestOpsLineage_Empty(t *testing.T) {
+	isolatedHome(t, true)
+	out, _, err := runRoot(t, nil, "ops", "lineage", "guard-tests", "--output", "json")
+	if err != nil {
+		t.Fatalf("lineage on empty: %v", err)
+	}
+	if strings.TrimSpace(out) != "[]" {
+		t.Errorf("want empty array, got %q", out)
+	}
+}
diff --git a/internal/eval/eval_test.go b/internal/eval/eval_test.go
new file mode 100644
index 0000000..489a590
--- /dev/null
+++ b/internal/eval/eval_test.go
@@ -0,0 +1,217 @@
+package eval
+
+import (
+	"os"
+	"path/filepath"
+	"testing"
+
+	"github.com/astra-sh/qvr/internal/ops/store"
+	"github.com/google/uuid"
+)
+
+// TestParse_RejectsNoOpGraders pins the hardening rule: a grader with no real
+// assertion (a gate that can never fail) is a config error, not a silent pass.
+func TestParse_RejectsNoOpGraders(t *testing.T) {
+	cases := []struct {
+		name    string
+		grader  string
+		wantErr bool
+	}{
+		{"outcome ok", "type: outcome\n            expect: success", false},
+		{"outcome no expect", "type: outcome", true},
+		{"text ok", "type: text\n            contains: [\"x\"]", false},
+		{"text empty", "type: text", true},
+		{"sequence empty", "type: tool_sequence", true},
+		{"constraint empty", "type: tool_constraint", true},
+		{"constraint maxTools ok", "type: tool_constraint\n            maxTools: 5", false},
+		{"skill_invocation empty", "type: skill_invocation", true},
+		{"behavior empty", "type: behavior", true},
+		{"behavior maxTurns ok", "type: behavior\n            maxTurns: 3", false},
+	}
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			y := "version: 1\nsuites:\n  - name: s\n    cases:\n      - name: c\n        graders:\n          - " + tc.grader + "\n"
+			_, err := Parse([]byte(y))
+			if (err != nil) != tc.wantErr {
+				t.Errorf("Parse err=%v, wantErr=%v\nyaml:\n%s", err, tc.wantErr, y)
+			}
+		})
+	}
+}
+
+// TestLoad_AbsentManifestIsNil pins the contract: a skill with no evals.yaml is
+// not an error (it just has no evals), while malformed YAML is.
+func TestLoad_AbsentManifestIsNil(t *testing.T) {
+	dir := t.TempDir()
+	m, err := Load(dir)
+	if err != nil || m != nil {
+		t.Errorf("absent manifest: got (%v, %v), want (nil, nil)", m, err)
+	}
+
+	if err := os.WriteFile(filepath.Join(dir, ManifestFile), []byte("version: 1\nsuites: [oops\n"), 0o644); err != nil {
+		t.Fatal(err)
+	}
+	if _, err := Load(dir); err == nil {
+		t.Error("expected an error for malformed evals.yaml")
+	}
+}
+
+// TestBuildEvidence distils a captured session into grader input: the final
+// assistant message and the tool sequence (ordered by start time, even when the
+// store hands spans back out of order).
+func TestBuildEvidence(t *testing.T) {
+	sid := uuid.New()
+	meta := &store.SessionMetaRow{
+		SessionID: sid, Outcome: "success", Skills: []string{"triage-issue"},
+		Tools: 2, Turns: 1, StartedMs: 1000, EndedMs: 2500,
+	}
+	// Spans deliberately out of order; the second tool starts later.
+	spans := []*store.SpanRow{
+		{Kind: "TOOL", StartMs: 1200, Attributes: `{"gen_ai.tool.name":"Edit"}`},
+		{Kind: "LLM", StartMs: 1000, Attributes: `{"gen_ai.output.messages":"[{\"role\":\"assistant\",\"content\":\"Labeled needs-info\"}]"}`},
+		{Kind: "TOOL", StartMs: 1100, Attributes: `{"gen_ai.tool.name":"Read"}`},
+	}
+	e := BuildEvidence(meta, spans)
+	if e.Outcome != "success" || e.FinalMessage != "Labeled needs-info" {
+		t.Errorf("evidence basics wrong: %+v", e)
+	}
+	if len(e.ToolSequence) != 2 || e.ToolSequence[0] != "Read" || e.ToolSequence[1] != "Edit" {
+		t.Errorf("tool sequence not ordered by start time: %v", e.ToolSequence)
+	}
+	if e.DurationMs != 1500 {
+		t.Errorf("duration = %d, want 1500", e.DurationMs)
+	}
+}
+
+// TestBuildEvidence_ToleratesGarbage pins robustness: malformed or empty span
+// attribute blobs must degrade gracefully (empty fields), never panic.
+func TestBuildEvidence_ToleratesGarbage(t *testing.T) {
+	meta := &store.SessionMetaRow{SessionID: uuid.New()}
+	spans := []*store.SpanRow{
+		{Kind: "TOOL", StartMs: 1, Attributes: "not json{"},
+		{Kind: "LLM", StartMs: 2, Attributes: ""},
+		{Kind: "TOOL", StartMs: 3, Attributes: `{"gen_ai.tool.name":"Read"}`},
+	}
+	e := BuildEvidence(meta, spans)
+	if e.FinalMessage != "" {
+		t.Errorf("want empty final message from garbage, got %q", e.FinalMessage)
+	}
+	if len(e.ToolSequence) != 1 || e.ToolSequence[0] != "Read" {
+		t.Errorf("want only the well-formed tool, got %v", e.ToolSequence)
+	}
+}
+
+func TestParse_ValidatesVersionAndStructure(t *testing.T) {
+	cases := []struct {
+		name    string
+		yaml    string
+		wantErr bool
+	}{
+		{"good", `
+version: 1
+suites:
+  - name: triage-correctness
+    cases:
+      - name: needs-info
+        graders:
+          - type: text
+            contains: ["needs-info"]
+`, false},
+		{"bad version", "version: 2\nsuites: []\n", true},
+		{"no suites", "version: 1\nsuites: []\n", true},
+		{"suite no cases", "version: 1\nsuites:\n  - name: s\n    cases: []\n", true},
+		{"unknown grader", `
+version: 1
+suites:
+  - name: s
+    cases:
+      - name: c
+        graders:
+          - type: prompt
+`, true},
+		{"case no graders", `
+version: 1
+suites:
+  - name: s
+    cases:
+      - name: c
+        graders: []
+`, true},
+	}
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			_, err := Parse([]byte(tc.yaml))
+			if (err != nil) != tc.wantErr {
+				t.Errorf("Parse err=%v, wantErr=%v", err, tc.wantErr)
+			}
+		})
+	}
+}
+
+func ev() *Evidence {
+	return &Evidence{
+		SessionID:    "s1",
+		Skill:        "triage-issue",
+		Outcome:      "success",
+		FinalMessage: "Labeled the issue needs-info because the requirement is ambiguous.",
+		ToolSequence: []string{"Read", "Bash", "Edit"},
+		Skills:       []string{"triage-issue"},
+		Tools:        3,
+		Turns:        2,
+		DurationMs:   1500,
+	}
+}
+
+func TestGraders(t *testing.T) {
+	cases := []struct {
+		name string
+		spec GraderSpec
+		want bool
+	}{
+		{"outcome pass", GraderSpec{Type: "outcome", Expect: "success"}, true},
+		{"outcome fail", GraderSpec{Type: "outcome", Expect: "failure"}, false},
+		{"text contains pass", GraderSpec{Type: "text", Contains: []string{"needs-info"}}, true},
+		{"text contains fail", GraderSpec{Type: "text", Contains: []string{"ready-to-implement"}}, false},
+		{"text reject pass", GraderSpec{Type: "text", Reject: []string{"ready-to-implement"}}, true},
+		{"text reject fail", GraderSpec{Type: "text", Reject: []string{"needs-info"}}, false},
+		{"sequence pass", GraderSpec{Type: "tool_sequence", Sequence: []string{"Read", "Edit"}}, true},
+		{"sequence fail (order)", GraderSpec{Type: "tool_sequence", Sequence: []string{"Edit", "Read"}}, false},
+		{"constraint expect pass", GraderSpec{Type: "tool_constraint", ExpectTools: []string{"Bash"}}, true},
+		{"constraint reject fail", GraderSpec{Type: "tool_constraint", RejectTools: []string{"Bash"}}, false},
+		{"constraint max fail", GraderSpec{Type: "tool_constraint", MaxTools: 2}, false},
+		{"skill expect pass", GraderSpec{Type: "skill_invocation", ExpectSkills: []string{"triage-issue"}}, true},
+		{"skill reject fail", GraderSpec{Type: "skill_invocation", RejectSkills: []string{"triage-issue"}}, false},
+		{"behavior pass", GraderSpec{Type: "behavior", MaxTools: 5, MaxTurns: 3}, true},
+		{"behavior fail turns", GraderSpec{Type: "behavior", MaxTurns: 1}, false},
+		{"behavior fail duration", GraderSpec{Type: "behavior", MaxDurationMs: 1000}, false},
+	}
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			got := runGrader(tc.spec, ev())
+			if got.Pass != tc.want {
+				t.Errorf("grader %s: pass=%v, want %v (detail=%q)", tc.spec.Type, got.Pass, tc.want, got.Detail)
+			}
+		})
+	}
+}
+
+// TestGradeSuite_AllMustPass pins the AND semantics: one failing grader fails
+// the case, and one failing case fails the suite.
+func TestGradeSuite_AllMustPass(t *testing.T) {
+	s := &Suite{Name: "triage-correctness", Cases: []Case{
+		{Name: "needs-info", Graders: []GraderSpec{
+			{Type: "outcome", Expect: "success"},
+			{Type: "text", Contains: []string{"needs-info"}, Reject: []string{"ready-to-implement"}},
+		}},
+	}}
+	if r := GradeSuite(s, ev()); !r.Pass || r.Passed != 1 || r.Failed != 0 {
+		t.Errorf("expected suite pass, got %+v", r)
+	}
+
+	// Flip the evidence to the pre-improvement (misclassified) run.
+	bad := ev()
+	bad.FinalMessage = "Labeled the issue ready-to-implement."
+	if r := GradeSuite(s, bad); r.Pass || r.Failed != 1 {
+		t.Errorf("expected suite fail on misclassification, got %+v", r)
+	}
+}
diff --git a/internal/eval/evidence.go b/internal/eval/evidence.go
new file mode 100644
index 0000000..11a4cdd
--- /dev/null
+++ b/internal/eval/evidence.go
@@ -0,0 +1,95 @@
+package eval
+
+import (
+	"encoding/json"
+	"slices"
+	"sort"
+
+	"github.com/astra-sh/qvr/internal/ops/derive"
+	"github.com/astra-sh/qvr/internal/ops/store"
+)
+
+// Evidence is the read model the graders judge: everything a deterministic
+// grader needs about one captured session, distilled from its derived spans and
+// session_meta. It deliberately carries no model output beyond the final
+// assistant message — graders read behavior (tools, skills, outcome), not
+// prose, except for the simple substring checks the text grader runs.
+type Evidence struct {
+	SessionID    string   `json:"session_id"`
+	Skill        string   `json:"skill,omitempty"`
+	Outcome      string   `json:"outcome,omitempty"` // session-level qvr.outcome rollup
+	FinalMessage string   `json:"final_message,omitempty"`
+	ToolSequence []string `json:"tool_sequence,omitempty"` // ordered tool names
+	Skills       []string `json:"skills,omitempty"`        // skills that fired
+	Tools        int      `json:"tools"`
+	Turns        int      `json:"turns"`
+	DurationMs   int64    `json:"duration_ms"`
+}
+
+// BuildEvidence distils a captured session into grader input. spans need not be
+// pre-sorted; the tool sequence is ordered by span start time so an
+// out-of-order store read can't scramble it.
+func BuildEvidence(meta *store.SessionMetaRow, spans []*store.SpanRow) *Evidence {
+	e := &Evidence{
+		SessionID:  meta.SessionID.String(),
+		Outcome:    meta.Outcome,
+		Skills:     append([]string(nil), meta.Skills...),
+		Tools:      int(meta.Tools),
+		Turns:      int(meta.Turns),
+		DurationMs: meta.DurationMs(),
+	}
+
+	ordered := append([]*store.SpanRow(nil), spans...)
+	sort.SliceStable(ordered, func(i, j int) bool { return ordered[i].StartMs < ordered[j].StartMs })
+
+	var lastLLMText string
+	for _, sp := range ordered {
+		attrs := decodeAttrs(sp.Attributes)
+		switch sp.Kind {
+		case derive.KindTool, derive.KindSkill:
+			if name, _ := attrs["gen_ai.tool.name"].(string); name != "" {
+				e.ToolSequence = append(e.ToolSequence, name)
+			}
+		case derive.KindLLM:
+			if msg := lastAssistantText(attrs); msg != "" {
+				lastLLMText = msg
+			}
+		}
+	}
+	e.FinalMessage = lastLLMText
+	return e
+}
+
+// decodeAttrs parses a span's JSON attribute blob, or returns an empty map.
+func decodeAttrs(raw string) map[string]any {
+	if raw == "" {
+		return map[string]any{}
+	}
+	var m map[string]any
+	if err := json.Unmarshal([]byte(raw), &m); err != nil {
+		return map[string]any{}
+	}
+	return m
+}
+
+// lastAssistantText pulls the assistant message text out of an LLM span's
+// gen_ai.output.messages attribute (a JSON-encoded [{role,content}] array).
+func lastAssistantText(attrs map[string]any) string {
+	raw, _ := attrs["gen_ai.output.messages"].(string)
+	if raw == "" {
+		return ""
+	}
+	var msgs []struct {
+		Role    string `json:"role"`
+		Content string `json:"content"`
+	}
+	if err := json.Unmarshal([]byte(raw), &msgs); err != nil {
+		return ""
+	}
+	for _, m := range slices.Backward(msgs) {
+		if m.Content != "" {
+			return m.Content
+		}
+	}
+	return ""
+}
diff --git a/internal/eval/grade.go b/internal/eval/grade.go
new file mode 100644
index 0000000..cf968b0
--- /dev/null
+++ b/internal/eval/grade.go
@@ -0,0 +1,50 @@
+package eval
+
+// CaseResult is the verdict on one case: it passes only when every grader
+// passed. The per-grader results are retained for diagnostics.
+type CaseResult struct {
+	Case    string         `json:"case"`
+	Pass    bool           `json:"pass"`
+	Graders []GraderResult `json:"graders"`
+}
+
+// SuiteResult aggregates a suite's case verdicts.
+type SuiteResult struct {
+	Suite  string       `json:"suite"`
+	Pass   bool         `json:"pass"`
+	Passed int          `json:"passed"`
+	Failed int          `json:"failed"`
+	Cases  []CaseResult `json:"cases"`
+}
+
+// GradeCase runs every grader in a case against the evidence. The case passes
+// only when all of them pass (AND semantics — a gate is as strong as its
+// strictest check).
+func GradeCase(c Case, e *Evidence) CaseResult {
+	r := CaseResult{Case: c.Name, Pass: true}
+	for _, g := range c.Graders {
+		gr := runGrader(g, e)
+		r.Graders = append(r.Graders, gr)
+		if !gr.Pass {
+			r.Pass = false
+		}
+	}
+	return r
+}
+
+// GradeSuite grades every case in a suite against the same evidence and rolls
+// up the pass/fail counts. The suite passes only when every case passes.
+func GradeSuite(s *Suite, e *Evidence) SuiteResult {
+	res := SuiteResult{Suite: s.Name, Pass: true}
+	for _, c := range s.Cases {
+		cr := GradeCase(c, e)
+		res.Cases = append(res.Cases, cr)
+		if cr.Pass {
+			res.Passed++
+		} else {
+			res.Failed++
+			res.Pass = false
+		}
+	}
+	return res
+}
diff --git a/internal/eval/graders.go b/internal/eval/graders.go
new file mode 100644
index 0000000..f1bfa7d
--- /dev/null
+++ b/internal/eval/graders.go
@@ -0,0 +1,205 @@
+package eval
+
+import (
+	"fmt"
+	"strings"
+)
+
+// GraderResult is one grader's verdict on one case.
+type GraderResult struct {
+	Type   string `json:"type"`
+	Name   string `json:"name,omitempty"`
+	Pass   bool   `json:"pass"`
+	Detail string `json:"detail,omitempty"`
+}
+
+// graderFunc judges evidence against a spec. Every grader is pure: same
+// (spec, evidence) → same result, so an eval run is reproducible.
+type graderFunc func(GraderSpec, *Evidence) GraderResult
+
+// graders is the registry of deterministic, trace-based graders. Each reads the
+// spans/meta qvr already captured — none runs an agent or calls a model. New
+// grader types register here; the LLM-judge ("prompt") grader is deliberately
+// NOT here — semantic judgement is layered on top as a skill, keeping the core
+// gate model-free.
+var graders = map[string]graderFunc{
+	"outcome":          gradeOutcome,
+	"text":             gradeText,
+	"tool_sequence":    gradeToolSequence,
+	"tool_constraint":  gradeToolConstraint,
+	"skill_invocation": gradeSkillInvocation,
+	"behavior":         gradeBehavior,
+}
+
+// runGrader dispatches one grader spec. An unknown type fails closed (a gate
+// that silently skips an unrecognized grader is worse than no gate); Load
+// already rejects these, so this is the belt-and-braces path.
+func runGrader(g GraderSpec, e *Evidence) GraderResult {
+	fn, ok := graders[g.Type]
+	if !ok {
+		return GraderResult{Type: g.Type, Name: g.Name, Pass: false, Detail: "unknown grader type"}
+	}
+	return fn(g, e)
+}
+
+// graderValidators rejects a grader whose config would make it a no-op — a gate
+// that can never fail is worse than no gate, because it reads as evidence while
+// asserting nothing. Each grader must carry at least one real assertion. Kept as
+// a registry (parallel to `graders`) so each check stays tiny.
+var graderValidators = map[string]func(GraderSpec) error{
+	"outcome": func(g GraderSpec) error {
+		return need(g.Expect != "", "needs `expect`")
+	},
+	"text": func(g GraderSpec) error {
+		return need(len(g.Contains) > 0 || len(g.Reject) > 0, "needs `contains` or `reject`")
+	},
+	"tool_sequence": func(g GraderSpec) error {
+		return need(len(g.Sequence) > 0, "needs `sequence`")
+	},
+	"tool_constraint": func(g GraderSpec) error {
+		return need(len(g.ExpectTools) > 0 || len(g.RejectTools) > 0 || g.MaxTools > 0,
+			"needs `expectTools`, `rejectTools`, or `maxTools`")
+	},
+	"skill_invocation": func(g GraderSpec) error {
+		return need(len(g.ExpectSkills) > 0 || len(g.RejectSkills) > 0, "needs `expectSkills` or `rejectSkills`")
+	},
+	"behavior": func(g GraderSpec) error {
+		return need(g.MaxTools > 0 || g.MaxTurns > 0 || g.MaxDurationMs > 0,
+			"needs `maxTools`, `maxTurns`, or `maxDurationMs`")
+	},
+}
+
+// validateGraderConfig runs the registered config check for a grader type.
+func validateGraderConfig(g GraderSpec) error {
+	if v, ok := graderValidators[g.Type]; ok {
+		return v(g)
+	}
+	return nil
+}
+
+// need returns an error with msg unless ok.
+func need(ok bool, msg string) error {
+	if ok {
+		return nil
+	}
+	return fmt.Errorf("%s", msg)
+}
+
+func gradeOutcome(g GraderSpec, e *Evidence) GraderResult {
+	pass := e.Outcome == g.Expect
+	detail := ""
+	if !pass {
+		got := e.Outcome
+		if got == "" {
+			got = "unknown"
+		}
+		detail = fmt.Sprintf("outcome=%s, want %s", got, g.Expect)
+	}
+	return GraderResult{Type: g.Type, Name: g.Name, Pass: pass, Detail: detail}
+}
+
+func gradeText(g GraderSpec, e *Evidence) GraderResult {
+	hay := evidenceField(g.On, e)
+	for _, want := range g.Contains {
+		if !strings.Contains(hay, want) {
+			return fail(g, fmt.Sprintf("missing %q", want))
+		}
+	}
+	for _, bad := range g.Reject {
+		if strings.Contains(hay, bad) {
+			return fail(g, fmt.Sprintf("contains rejected %q", bad))
+		}
+	}
+	return pass(g)
+}
+
+// evidenceField selects the text body a text grader reads. final_message is the
+// default and only field today; an unknown name falls back to it.
+func evidenceField(name string, e *Evidence) string {
+	switch name {
+	case "", "final_message":
+		return e.FinalMessage
+	default:
+		return e.FinalMessage
+	}
+}
+
+func gradeToolSequence(g GraderSpec, e *Evidence) GraderResult {
+	if isSubsequence(g.Sequence, e.ToolSequence) {
+		return pass(g)
+	}
+	return fail(g, fmt.Sprintf("sequence %v not found in %v", g.Sequence, e.ToolSequence))
+}
+
+func gradeToolConstraint(g GraderSpec, e *Evidence) GraderResult {
+	used := toSet(e.ToolSequence)
+	for _, want := range g.ExpectTools {
+		if !used[want] {
+			return fail(g, fmt.Sprintf("expected tool %q never used", want))
+		}
+	}
+	for _, bad := range g.RejectTools {
+		if used[bad] {
+			return fail(g, fmt.Sprintf("rejected tool %q was used", bad))
+		}
+	}
+	if g.MaxTools > 0 && len(e.ToolSequence) > g.MaxTools {
+		return fail(g, fmt.Sprintf("%d tool calls exceed max %d", len(e.ToolSequence), g.MaxTools))
+	}
+	return pass(g)
+}
+
+func gradeSkillInvocation(g GraderSpec, e *Evidence) GraderResult {
+	fired := toSet(e.Skills)
+	for _, want := range g.ExpectSkills {
+		if !fired[want] {
+			return fail(g, fmt.Sprintf("expected skill %q never fired", want))
+		}
+	}
+	for _, bad := range g.RejectSkills {
+		if fired[bad] {
+			return fail(g, fmt.Sprintf("rejected skill %q fired", bad))
+		}
+	}
+	return pass(g)
+}
+
+func gradeBehavior(g GraderSpec, e *Evidence) GraderResult {
+	if g.MaxTools > 0 && e.Tools > g.MaxTools {
+		return fail(g, fmt.Sprintf("%d tools exceed max %d", e.Tools, g.MaxTools))
+	}
+	if g.MaxTurns > 0 && e.Turns > g.MaxTurns {
+		return fail(g, fmt.Sprintf("%d turns exceed max %d", e.Turns, g.MaxTurns))
+	}
+	if g.MaxDurationMs > 0 && e.DurationMs > g.MaxDurationMs {
+		return fail(g, fmt.Sprintf("%dms exceeds max %dms", e.DurationMs, g.MaxDurationMs))
+	}
+	return pass(g)
+}
+
+// --- small helpers ---
+
+func pass(g GraderSpec) GraderResult { return GraderResult{Type: g.Type, Name: g.Name, Pass: true} }
+func fail(g GraderSpec, detail string) GraderResult {
+	return GraderResult{Type: g.Type, Name: g.Name, Pass: false, Detail: detail}
+}
+
+func toSet(items []string) map[string]bool {
+	s := make(map[string]bool, len(items))
+	for _, it := range items {
+		s[it] = true
+	}
+	return s
+}
+
+// isSubsequence reports whether want appears in order (not necessarily
+// contiguously) within have. An empty want trivially matches.
+func isSubsequence(want, have []string) bool {
+	i := 0
+	for _, h := range have {
+		if i < len(want) && h == want[i] {
+			i++
+		}
+	}
+	return i == len(want)
+}
diff --git a/internal/eval/manifest.go b/internal/eval/manifest.go
new file mode 100644
index 0000000..f7f3de4
--- /dev/null
+++ b/internal/eval/manifest.go
@@ -0,0 +1,157 @@
+// Package eval is qvr's deterministic, trace-based skill-evaluation substrate:
+// it reads an evals.yaml manifest (a sibling of SKILL.md, qvr's own format — not
+// part of the agentskills.io spec) and grades a skill's CAPTURED sessions
+// against it. The grading is pure Go over the spans qvr already derived from
+// real usage — no model calls, no agent execution, no sandbox replay. That keeps
+// the gate inside qvr's "uv for skills" core; semantic LLM-judge grading and
+// running a candidate skill against tasks are layered on top as skills, not
+// built into this binary.
+package eval
+
+import (
+	"fmt"
+	"os"
+	"path/filepath"
+
+	"gopkg.in/yaml.v3"
+)
+
+// ManifestFile is the conventional eval manifest name, a sibling of SKILL.md.
+const ManifestFile = "evals.yaml"
+
+// Manifest is a skill's eval definition: named suites of named cases, each a set
+// of graders that must all pass for the case to pass.
+type Manifest struct {
+	Version int     `yaml:"version" json:"version"`
+	Suites  []Suite `yaml:"suites" json:"suites"`
+}
+
+// Suite is a named group of cases (e.g. "triage-correctness", "safety").
+type Suite struct {
+	Name  string `yaml:"name" json:"name"`
+	Cases []Case `yaml:"cases" json:"cases"`
+}
+
+// Case is one named expectation, satisfied only when every grader passes.
+type Case struct {
+	Name    string       `yaml:"name" json:"name"`
+	Graders []GraderSpec `yaml:"graders" json:"graders"`
+}
+
+// GraderSpec configures one grader. It is intentionally a flat struct: each
+// grader type reads only the fields it cares about, which keeps YAML authoring
+// simple and avoids polymorphic decoding. Unknown grader types fail loudly at
+// run time rather than being silently skipped (a silently-skipped gate is worse
+// than no gate).
+type GraderSpec struct {
+	Type string `yaml:"type" json:"type"`
+	Name string `yaml:"name,omitempty" json:"name,omitempty"`
+
+	// outcome grader: the session-level qvr.outcome must equal Expect.
+	Expect string `yaml:"expect,omitempty" json:"expect,omitempty"`
+
+	// text grader: every Contains substring must appear and no Reject substring
+	// may appear in the field named by On (default: final_message).
+	On       string   `yaml:"on,omitempty" json:"on,omitempty"`
+	Contains []string `yaml:"contains,omitempty" json:"contains,omitempty"`
+	Reject   []string `yaml:"reject,omitempty" json:"reject,omitempty"`
+
+	// tool_sequence grader: Sequence must appear as an ordered subsequence of
+	// the session's tool calls.
+	Sequence []string `yaml:"sequence,omitempty" json:"sequence,omitempty"`
+
+	// tool_constraint grader: every ExpectTools tool must have been used, no
+	// RejectTools tool may have been used, and (when >0) the call count must not
+	// exceed MaxTools.
+	ExpectTools []string `yaml:"expectTools,omitempty" json:"expect_tools,omitempty"`
+	RejectTools []string `yaml:"rejectTools,omitempty" json:"reject_tools,omitempty"`
+
+	// skill_invocation grader: ExpectSkills must all have fired, RejectSkills
+	// none.
+	ExpectSkills []string `yaml:"expectSkills,omitempty" json:"expect_skills,omitempty"`
+	RejectSkills []string `yaml:"rejectSkills,omitempty" json:"reject_skills,omitempty"`
+
+	// behavior grader: efficiency ceilings (0 = ignored).
+	MaxTools      int   `yaml:"maxTools,omitempty" json:"max_tools,omitempty"`
+	MaxTurns      int   `yaml:"maxTurns,omitempty" json:"max_turns,omitempty"`
+	MaxDurationMs int64 `yaml:"maxDurationMs,omitempty" json:"max_duration_ms,omitempty"`
+}
+
+// Load reads and validates the evals.yaml in skillDir. Returns (nil, nil) when
+// the skill has no manifest — an un-evaluated skill is not an error.
+func Load(skillDir string) (*Manifest, error) {
+	path := filepath.Join(skillDir, ManifestFile)
+	data, err := os.ReadFile(path)
+	if err != nil {
+		if os.IsNotExist(err) {
+			return nil, nil
+		}
+		return nil, fmt.Errorf("read %s: %w", path, err)
+	}
+	return Parse(data)
+}
+
+// Parse decodes and validates a manifest from raw YAML bytes.
+func Parse(data []byte) (*Manifest, error) {
+	var m Manifest
+	if err := yaml.Unmarshal(data, &m); err != nil {
+		return nil, fmt.Errorf("parse evals.yaml: %w", err)
+	}
+	if err := m.validate(); err != nil {
+		return nil, err
+	}
+	return &m, nil
+}
+
+// validate enforces the structural invariants: a supported version, named
+// non-empty suites and cases, and only known grader types.
+func (m *Manifest) validate() error {
+	if m.Version != 1 {
+		return fmt.Errorf("evals.yaml: unsupported version %d (want 1)", m.Version)
+	}
+	if len(m.Suites) == 0 {
+		return fmt.Errorf("evals.yaml: no suites defined")
+	}
+	for _, s := range m.Suites {
+		if s.Name == "" {
+			return fmt.Errorf("evals.yaml: a suite has no name")
+		}
+		if len(s.Cases) == 0 {
+			return fmt.Errorf("evals.yaml: suite %q has no cases", s.Name)
+		}
+		for _, c := range s.Cases {
+			if err := validateCase(s.Name, c); err != nil {
+				return err
+			}
+		}
+	}
+	return nil
+}
+
+func validateCase(suite string, c Case) error {
+	if c.Name == "" {
+		return fmt.Errorf("evals.yaml: a case in suite %q has no name", suite)
+	}
+	if len(c.Graders) == 0 {
+		return fmt.Errorf("evals.yaml: case %q/%q has no graders", suite, c.Name)
+	}
+	for _, g := range c.Graders {
+		if _, ok := graders[g.Type]; !ok {
+			return fmt.Errorf("evals.yaml: case %q/%q uses unknown grader type %q", suite, c.Name, g.Type)
+		}
+		if err := validateGraderConfig(g); err != nil {
+			return fmt.Errorf("evals.yaml: case %q/%q %s grader %w", suite, c.Name, g.Type, err)
+		}
+	}
+	return nil
+}
+
+// Suite returns the named suite, or nil when absent.
+func (m *Manifest) Suite(name string) *Suite {
+	for i := range m.Suites {
+		if m.Suites[i].Name == name {
+			return &m.Suites[i]
+		}
+	}
+	return nil
+}
diff --git a/internal/eval/runner.go b/internal/eval/runner.go
new file mode 100644
index 0000000..3403145
--- /dev/null
+++ b/internal/eval/runner.go
@@ -0,0 +1,107 @@
+package eval
+
+import (
+	"context"
+	"fmt"
+
+	"github.com/astra-sh/qvr/internal/ops/store"
+	"github.com/google/uuid"
+)
+
+// RunInput parameterizes one eval run. SessionID pins which captured session to
+// grade; when nil, the skill's most recent session is used (the "online eval"
+// default — grade the latest real run). Suite "" grades every suite.
+type RunInput struct {
+	SkillName string
+	SkillDir  string
+	Suite     string
+	SessionID *uuid.UUID
+}
+
+// RunResult is the outcome of grading one session against the manifest.
+type RunResult struct {
+	Skill     string        `json:"skill"`
+	SessionID string        `json:"session_id"`
+	Pass      bool          `json:"pass"`
+	Passed    int           `json:"passed"`
+	Failed    int           `json:"failed"`
+	Suites    []SuiteResult `json:"suites"`
+}
+
+// Run loads the skill's evals.yaml, grades a captured session against the
+// requested suite(s), and returns the result. It NEVER runs an agent or calls a
+// model — it grades spans qvr already captured. Writing the result to the
+// eval-run store (the lock × evidence join) is the caller's job.
+func Run(ctx context.Context, s store.Store, in RunInput) (*RunResult, error) {
+	man, err := Load(in.SkillDir)
+	if err != nil {
+		return nil, err
+	}
+	if man == nil {
+		return nil, fmt.Errorf("skill %q has no %s in %s", in.SkillName, ManifestFile, in.SkillDir)
+	}
+
+	suites, err := selectSuites(man, in.Suite)
+	if err != nil {
+		return nil, err
+	}
+
+	sess, err := resolveSession(ctx, s, in)
+	if err != nil {
+		return nil, err
+	}
+	spans, err := s.QuerySpans(ctx, &store.SpanFilter{SessionID: &sess.SessionID})
+	if err != nil {
+		return nil, fmt.Errorf("load spans for %s: %w", sess.SessionID, err)
+	}
+	ev := BuildEvidence(sess, spans)
+	ev.Skill = in.SkillName
+
+	res := &RunResult{Skill: in.SkillName, SessionID: ev.SessionID, Pass: true}
+	for i := range suites {
+		sr := GradeSuite(&suites[i], ev)
+		res.Suites = append(res.Suites, sr)
+		res.Passed += sr.Passed
+		res.Failed += sr.Failed
+		if !sr.Pass {
+			res.Pass = false
+		}
+	}
+	return res, nil
+}
+
+// selectSuites resolves the requested suite name to a slice — one named suite,
+// or all of them when name is empty.
+func selectSuites(man *Manifest, name string) ([]Suite, error) {
+	if name == "" {
+		return man.Suites, nil
+	}
+	su := man.Suite(name)
+	if su == nil {
+		return nil, fmt.Errorf("no suite %q in evals.yaml", name)
+	}
+	return []Suite{*su}, nil
+}
+
+// resolveSession finds the session to grade: the pinned id, or the skill's most
+// recent captured session.
+func resolveSession(ctx context.Context, s store.Store, in RunInput) (*store.SessionMetaRow, error) {
+	if in.SessionID != nil {
+		m, err := s.GetSessionMeta(ctx, *in.SessionID)
+		if err != nil {
+			return nil, fmt.Errorf("look up session %s: %w", in.SessionID, err)
+		}
+		if m == nil {
+			return nil, fmt.Errorf("no captured session %s", in.SessionID)
+		}
+		return m, nil
+	}
+	rows, err := s.ListSessionMeta(ctx, &store.SessionMetaFilter{Skill: in.SkillName, Limit: 1})
+	if err != nil {
+		return nil, fmt.Errorf("find latest session for %q: %w", in.SkillName, err)
+	}
+	if len(rows) == 0 {
+		return nil, fmt.Errorf("no captured sessions for skill %q to evaluate — run it, then `qvr audit discover`", in.SkillName)
+	}
+	return rows[0], nil
+}
diff --git a/internal/ops/derive/claude.go b/internal/ops/derive/claude.go
index 70e14f6..e7cfe81 100644
--- a/internal/ops/derive/claude.go
+++ b/internal/ops/derive/claude.go
@@ -305,10 +305,12 @@ func (t *turn) applyToolResult(b claudeBlock, ts int64) {
 		return
 	}
 	sp := &t.tools[idx]
-	sp.Attributes["gen_ai.tool.call.result"] = decodeToolResultText(b.Content)
+	result := decodeToolResultText(b.Content)
+	sp.Attributes["gen_ai.tool.call.result"] = result
 	if b.IsError {
 		sp.Attributes["error.type"] = "tool_failure"
 	}
+	sp.Attributes[OutcomeKey] = classifyOutcome(result, b.IsError)
 	if ts > sp.StartMs {
 		sp.EndMs = ts
 	}
diff --git a/internal/ops/derive/derive.go b/internal/ops/derive/derive.go
index cb512b6..6d46748 100644
--- a/internal/ops/derive/derive.go
+++ b/internal/ops/derive/derive.go
@@ -42,7 +42,14 @@ import (
 // resultDisplay (string OR object) no longer drops the whole item. Dispatch
 // is also mixed-agent safe: the majority transcript agent owns the session,
 // so a stale external writer's foreign rows can't blank it.
-const Version = 7
+// v8: outcome signal — TOOL/SKILL spans carry qvr.outcome
+// (success/failure/blocked; absent = unknown), derived from the result's error
+// flag plus a denial/interrupt marker for blocked (hook-deny lives in hook
+// payloads the deriver never sees, so transcript-silent denial is failure).
+// SessionMeta carries a worst-of-spans rollup (Outcome), and the OTLP span
+// status code now reflects it (ERROR for failure/blocked) instead of a
+// hardcoded OK. Rederive backfills outcome onto historical sessions.
+const Version = 8
 
 // KindSkill is the Quiver span category for a skill load/invocation within a
 // turn. It exists so the trace makes skill usage a first-class, queryable stage
@@ -79,6 +86,10 @@ type SessionMeta struct {
 	// only at session level (hermes, copilot), and a deriver-set value wins.
 	TokensIn  *int64
 	TokensOut *int64
+	// Outcome is the session-level rollup of its TOOL/SKILL spans' qvr.outcome:
+	// the worst seen (failure > blocked > success), or "" when no span carried an
+	// outcome (rendered unknown, never a fabricated success).
+	Outcome string
 }
 
 // Derivation is one session's full derived projection: the unified meta plus
@@ -211,6 +222,7 @@ func finalizeMeta(d *Derivation, rows []*ops.RawTrace) {
 	}
 	fillMetaFromSpans(m, d.Spans)
 	fillMetaTokens(m, d.Spans)
+	fillMetaOutcome(m, d.Spans)
 	// Formats without in-file timestamps derive zero time bounds; fall back to
 	// the capture times so the session still buckets into the right day.
 	if m.StartedMs == 0 && len(rows) > 0 {
@@ -288,6 +300,36 @@ func fillMetaTokens(m *SessionMeta, spans []Span) {
 	}
 }
 
+// outcomeRank orders outcomes by severity for the session rollup: a worse
+// outcome on any span dominates. Unknown/absent ranks below an explicit
+// success, so a session with one graded success reads success, not unknown.
+var outcomeRank = map[string]int{
+	OutcomeSuccess: 1,
+	OutcomeBlocked: 2,
+	OutcomeFailure: 3,
+}
+
+// fillMetaOutcome rolls the TOOL/SKILL spans' qvr.outcome into a single
+// session-level verdict: the worst outcome any span carried. Spans without an
+// outcome (LLM turns, calls whose result was never seen) don't vote; a session
+// where none voted stays "" (unknown), never a fabricated success.
+func fillMetaOutcome(m *SessionMeta, spans []Span) {
+	best, bestRank := "", 0
+	for _, sp := range spans {
+		if sp.Kind != KindTool && sp.Kind != KindSkill {
+			continue
+		}
+		v, ok := sp.Attributes[OutcomeKey].(string)
+		if !ok {
+			continue
+		}
+		if r := outcomeRank[v]; r > bestRank {
+			best, bestRank = v, r
+		}
+	}
+	m.Outcome = best
+}
+
 // addAttrTokens folds one span's token attribute value into a running
 // nullable sum; an absent attribute leaves the sum untouched (and nil).
 func addAttrTokens(sum **int64, v any) {
diff --git a/internal/ops/derive/outcome_test.go b/internal/ops/derive/outcome_test.go
new file mode 100644
index 0000000..4001f2f
--- /dev/null
+++ b/internal/ops/derive/outcome_test.go
@@ -0,0 +1,156 @@
+package derive_test
+
+import (
+	"testing"
+
+	"github.com/astra-sh/qvr/internal/ops"
+	"github.com/astra-sh/qvr/internal/ops/derive"
+	"github.com/google/uuid"
+)
+
+// Outcome contract (deriver v8):
+//   - TOOL/SKILL spans carry qvr.outcome ONLY once their result arrives —
+//     success when the result is not an error, failure when it is, and blocked
+//     when the error text is a user denial/interrupt (a governance signal).
+//   - A call whose result is never seen carries NO outcome (absence = unknown);
+//     we never fabricate a success.
+//   - SessionMeta.Outcome is the worst-of-spans rollup.
+//   - The OTLP span status code reflects the outcome (ERROR for failure/blocked)
+//     instead of the old hardcoded OK.
+
+// claudeToolRun builds a minimal claude session: a prompt, a Bash tool_use, and
+// its tool_result with the given text + error flag.
+func claudeToolRun(sid uuid.UUID, result string, isError bool) []*ops.RawTrace {
+	errField := ""
+	if isError {
+		errField = `,"is_error":true`
+	}
+	return []*ops.RawTrace{
+		row(sid, 0, `{"type":"user","timestamp":"2026-06-12T00:00:00.000Z","message":{"role":"user","content":"run it"}}`),
+		row(sid, 1, `{"type":"assistant","timestamp":"2026-06-12T00:00:01.000Z","message":{"role":"assistant","model":"claude-opus-4-8","content":[{"type":"tool_use","id":"tu1","name":"Bash","input":{"command":"ls"}}]}}`),
+		row(sid, 2, `{"type":"user","timestamp":"2026-06-12T00:00:02.000Z","message":{"role":"user","content":[{"type":"tool_result","tool_use_id":"tu1","content":`+jsonQuote(result)+errField+`}]}}`),
+	}
+}
+
+// jsonQuote returns a JSON string literal for s.
+func jsonQuote(s string) string {
+	out := []byte{'"'}
+	for _, r := range s {
+		switch r {
+		case '"', '\\':
+			out = append(out, '\\', byte(r))
+		case '\n':
+			out = append(out, '\\', 'n')
+		default:
+			out = append(out, []byte(string(r))...)
+		}
+	}
+	return string(append(out, '"'))
+}
+
+func TestOutcome_Success(t *testing.T) {
+	sid := uuid.MustParse("550e8400-e29b-41d4-a716-44665544aa01")
+	d, err := derive.DeriveSession(claudeToolRun(sid, "done", false))
+	if err != nil {
+		t.Fatalf("derive: %v", err)
+	}
+	_, tool, _ := splitSpanKinds(d.Spans)
+	wantAttrs(t, tool, map[string]any{derive.OutcomeKey: derive.OutcomeSuccess})
+	if d.Meta.Outcome != derive.OutcomeSuccess {
+		t.Errorf("meta outcome = %q, want %q", d.Meta.Outcome, derive.OutcomeSuccess)
+	}
+}
+
+func TestOutcome_Failure(t *testing.T) {
+	sid := uuid.MustParse("550e8400-e29b-41d4-a716-44665544aa02")
+	d, err := derive.DeriveSession(claudeToolRun(sid, "command not found", true))
+	if err != nil {
+		t.Fatalf("derive: %v", err)
+	}
+	_, tool, _ := splitSpanKinds(d.Spans)
+	wantAttrs(t, tool, map[string]any{derive.OutcomeKey: derive.OutcomeFailure})
+	if d.Meta.Outcome != derive.OutcomeFailure {
+		t.Errorf("meta outcome = %q, want %q", d.Meta.Outcome, derive.OutcomeFailure)
+	}
+	// The OTLP status code must now be ERROR(2), not the old hardcoded OK(1).
+	if code := otlpStatusForSpan(derive.ToOTLP(d.Spans), tool.SpanID); code != 2 {
+		t.Errorf("OTLP status code = %v, want 2 (ERROR) for a failed tool call", code)
+	}
+}
+
+func TestOutcome_Blocked(t *testing.T) {
+	sid := uuid.MustParse("550e8400-e29b-41d4-a716-44665544aa03")
+	// A user-rejected tool call: is_error with the harness denial text.
+	d, err := derive.DeriveSession(claudeToolRun(sid,
+		"The user doesn't want to proceed with this tool use. The tool use was rejected.", true))
+	if err != nil {
+		t.Fatalf("derive: %v", err)
+	}
+	_, tool, _ := splitSpanKinds(d.Spans)
+	wantAttrs(t, tool, map[string]any{derive.OutcomeKey: derive.OutcomeBlocked})
+	if d.Meta.Outcome != derive.OutcomeBlocked {
+		t.Errorf("meta outcome = %q, want %q", d.Meta.Outcome, derive.OutcomeBlocked)
+	}
+}
+
+// TestOutcome_PendingIsUnknown pins the absence invariant: a tool call whose
+// result never arrives carries no outcome, and the session rolls up to "".
+func TestOutcome_PendingIsUnknown(t *testing.T) {
+	sid := uuid.MustParse("550e8400-e29b-41d4-a716-44665544aa04")
+	rows := []*ops.RawTrace{
+		row(sid, 0, `{"type":"user","timestamp":"2026-06-12T00:00:00.000Z","message":{"role":"user","content":"run it"}}`),
+		row(sid, 1, `{"type":"assistant","timestamp":"2026-06-12T00:00:01.000Z","message":{"role":"assistant","model":"claude-opus-4-8","content":[{"type":"tool_use","id":"tu1","name":"Bash","input":{"command":"ls"}}]}}`),
+	}
+	d, err := derive.DeriveSession(rows)
+	if err != nil {
+		t.Fatalf("derive: %v", err)
+	}
+	_, tool, _ := splitSpanKinds(d.Spans)
+	if v, ok := tool.Attributes[derive.OutcomeKey]; ok {
+		t.Errorf("qvr.outcome present (%v) on a call with no result — fabricated outcome", v)
+	}
+	if d.Meta.Outcome != "" {
+		t.Errorf("meta outcome = %q, want \"\" (unknown) when no span carried an outcome", d.Meta.Outcome)
+	}
+}
+
+// TestOutcome_SessionWorstOf pins the rollup: one success and one failure in the
+// same session roll up to failure (the worst).
+func TestOutcome_SessionWorstOf(t *testing.T) {
+	sid := uuid.MustParse("550e8400-e29b-41d4-a716-44665544aa05")
+	rows := []*ops.RawTrace{
+		row(sid, 0, `{"type":"user","timestamp":"2026-06-12T00:00:00.000Z","message":{"role":"user","content":"run both"}}`),
+		row(sid, 1, `{"type":"assistant","timestamp":"2026-06-12T00:00:01.000Z","message":{"role":"assistant","model":"claude-opus-4-8","content":[{"type":"tool_use","id":"tu1","name":"Bash","input":{"command":"ok"}},{"type":"tool_use","id":"tu2","name":"Bash","input":{"command":"bad"}}]}}`),
+		row(sid, 2, `{"type":"user","timestamp":"2026-06-12T00:00:02.000Z","message":{"role":"user","content":[{"type":"tool_result","tool_use_id":"tu1","content":"fine"}]}}`),
+		row(sid, 3, `{"type":"user","timestamp":"2026-06-12T00:00:03.000Z","message":{"role":"user","content":[{"type":"tool_result","tool_use_id":"tu2","content":"boom","is_error":true}]}}`),
+	}
+	d, err := derive.DeriveSession(rows)
+	if err != nil {
+		t.Fatalf("derive: %v", err)
+	}
+	if d.Meta.Outcome != derive.OutcomeFailure {
+		t.Errorf("meta outcome = %q, want %q (worst-of success+failure)", d.Meta.Outcome, derive.OutcomeFailure)
+	}
+}
+
+// otlpStatusForSpan digs the status.code out of a ToOTLP envelope for one span,
+// or -1 when not found.
+func otlpStatusForSpan(env map[string]any, spanID string) int {
+	rs, _ := env["resourceSpans"].([]map[string]any)
+	for _, r := range rs {
+		ss, _ := r["scopeSpans"].([]map[string]any)
+		for _, sc := range ss {
+			spans, _ := sc["spans"].([]map[string]any)
+			for _, sp := range spans {
+				if sp["spanId"] == spanID {
+					if st, ok := sp["status"].(map[string]any); ok {
+						if c, ok := st["code"].(int); ok {
+							return c
+						}
+					}
+				}
+			}
+		}
+	}
+	return -1
+}
diff --git a/internal/ops/derive/shared.go b/internal/ops/derive/shared.go
index acd0664..4bbd922 100644
--- a/internal/ops/derive/shared.go
+++ b/internal/ops/derive/shared.go
@@ -234,6 +234,7 @@ func applyResultTo(sp *Span, result string, ts int64, isError bool) {
 	if isError {
 		sp.Attributes["error.type"] = "tool_failure"
 	}
+	sp.Attributes[OutcomeKey] = classifyOutcome(result, isError)
 	if ts > sp.StartMs {
 		sp.EndMs = ts
 	}
diff --git a/internal/ops/derive/span.go b/internal/ops/derive/span.go
index 2aa4e35..5724c41 100644
--- a/internal/ops/derive/span.go
+++ b/internal/ops/derive/span.go
@@ -17,6 +17,7 @@ package derive
 import (
 	"encoding/json"
 	"strconv"
+	"strings"
 )
 
 // Default resource/scope identifiers stamped on emitted spans. Vendor-neutral.
@@ -35,6 +36,62 @@ const (
 	KindChain = "CHAIN"
 )
 
+// OutcomeKey is the Quiver per-span result attribute on TOOL/SKILL spans: did
+// the call succeed, fail, or get blocked? It is derived purely from the
+// transcript — a tool result's error flag and, for a denied/interrupted call,
+// its text — so it stays a regenerable projection. Its ABSENCE means unknown
+// (the call never reported a result); we never fabricate a success. The session
+// read model rolls these up into SessionMeta.Outcome (worst-of-spans).
+const OutcomeKey = "qvr.outcome"
+
+const (
+	OutcomeSuccess = "success"
+	OutcomeFailure = "failure"
+	OutcomeBlocked = "blocked"
+	OutcomeUnknown = "unknown"
+)
+
+// classifyOutcome maps a tool result to an outcome. A non-error result is a
+// success; an error the transcript marks as a user denial or interrupt is
+// blocked (a governance signal, not a tool defect); any other error is a
+// failure.
+func classifyOutcome(result string, isError bool) string {
+	if !isError {
+		return OutcomeSuccess
+	}
+	if isBlockedResult(result) {
+		return OutcomeBlocked
+	}
+	return OutcomeFailure
+}
+
+// blockedResultMarkers are substrings a harness writes into a tool result when
+// the user REJECTED or INTERRUPTED the call rather than the tool failing on its
+// own (observed in real Claude Code stores). Matched case-insensitively. This
+// is the only "blocked" source available to a pure deriver: hook-deny decisions
+// live in hook payloads, which the deriver never sees (capture filters to
+// transcript rows), so denial that a transcript doesn't echo derives to failure.
+var blockedResultMarkers = []string{
+	"the user doesn't want to proceed",
+	"tool use was rejected",
+	"user doesn't want to take this action",
+	"request interrupted by user",
+	"[request interrupted",
+}
+
+func isBlockedResult(result string) bool {
+	if result == "" {
+		return false
+	}
+	low := strings.ToLower(result)
+	for _, m := range blockedResultMarkers {
+		if strings.Contains(low, m) {
+			return true
+		}
+	}
+	return false
+}
+
 // Span is one derived span. Times are epoch milliseconds. Attributes use OTel
 // GenAI semantic-convention keys (e.g. "gen_ai.operation.name",
 // "gen_ai.request.model", "gen_ai.usage.input_tokens") plus the Quiver
@@ -116,7 +173,7 @@ func (s Span) otlpSpan() map[string]any {
 		"startTimeUnixNano": msToNano(s.StartMs),
 		"endTimeUnixNano":   msToNano(end),
 		"attributes":        attrs,
-		"status":            map[string]any{"code": 1},
+		"status":            map[string]any{"code": otlpStatusCode(s.Attributes)},
 	}
 	if s.ParentSpanID != "" {
 		obj["parentSpanId"] = s.ParentSpanID
@@ -124,6 +181,22 @@ func (s Span) otlpSpan() map[string]any {
 	return obj
 }
 
+// otlpStatusCode maps a span's outcome to an OTLP status code: a failed or
+// blocked call is ERROR(2), an explicit success is OK(1), and anything without
+// an outcome (LLM turns, calls whose result was never seen) is UNSET(0), the
+// OTel default. Before this, every span emitted OK(1) unconditionally — a
+// failed tool call looked successful to any OTLP backend.
+func otlpStatusCode(attrs map[string]any) int {
+	switch attrs[OutcomeKey] {
+	case OutcomeFailure, OutcomeBlocked:
+		return 2
+	case OutcomeSuccess:
+		return 1
+	default:
+		return 0
+	}
+}
+
 // ToOTLP renders a batch of spans as a single OTLP resourceSpans payload. For
 // an empty batch it returns a well-formed envelope with an empty resourceSpans
 // array — never a nil/`null` body — so a consumer always receives valid OTLP
diff --git a/internal/ops/rawtrace/capture.go b/internal/ops/rawtrace/capture.go
index 92da4aa..8e790d7 100644
--- a/internal/ops/rawtrace/capture.go
+++ b/internal/ops/rawtrace/capture.go
@@ -134,6 +134,7 @@ func persistDerivation(ctx context.Context, s Store, sessionID uuid.UUID) (int,
 		Skills:          d.Meta.Skills,
 		TokensIn:        d.Meta.TokensIn,
 		TokensOut:       d.Meta.TokensOut,
+		Outcome:         d.Meta.Outcome,
 		DeriverVersion:  derive.Version,
 	}
 	return len(out), hasSkill, s.ReplaceSessionDerivation(ctx, meta, out)
diff --git a/internal/ops/store/annotations.go b/internal/ops/store/annotations.go
new file mode 100644
index 0000000..c3db406
--- /dev/null
+++ b/internal/ops/store/annotations.go
@@ -0,0 +1,118 @@
+package store
+
+import (
+	"context"
+	"database/sql"
+	"fmt"
+	"strings"
+	"time"
+
+	"github.com/google/uuid"
+)
+
+// AnnotationRow is one human verdict on a captured session — the outer-loop
+// feedback signal. It is durable: derivation (`qvr audit rederive`) never
+// rewrites it, unlike the spans and session_meta projections. A whole-session
+// verdict leaves Skill empty; a per-skill verdict scopes it to one skill.
+type AnnotationRow struct {
+	SessionID uuid.UUID `json:"session_id"`
+	Skill     string    `json:"skill,omitempty"`
+	Outcome   string    `json:"outcome"` // the reviewer's verdict (e.g. good, bad, blocked)
+	Note      string    `json:"note,omitempty"`
+	Author    string    `json:"author,omitempty"`
+	CreatedAt time.Time `json:"created_at"`
+}
+
+// AnnotationFilter scopes ListAnnotations. Nil/zero fields are ignored; the
+// result is newest-first by created_at.
+type AnnotationFilter struct {
+	SessionID *uuid.UUID
+	Skill     string
+	Since     *time.Time
+}
+
+// PutAnnotation appends one human verdict. Annotations are append-only — a
+// session can accrue several over time — so this never replaces a prior row.
+func (s *sqliteStore) PutAnnotation(ctx context.Context, a *AnnotationRow) error {
+	if a == nil {
+		return fmt.Errorf("store: nil annotation")
+	}
+	if a.Outcome == "" {
+		return fmt.Errorf("store: annotation outcome is required")
+	}
+	if a.CreatedAt.IsZero() {
+		a.CreatedAt = time.Now().UTC()
+	}
+	_, err := s.db.ExecContext(ctx,
+		`INSERT INTO annotations(session_id, skill, outcome, note, author, created_at)
+		 VALUES (?,?,?,?,?,?)`,
+		a.SessionID.String(), nullableString(a.Skill), a.Outcome,
+		nullableString(a.Note), nullableString(a.Author), a.CreatedAt.UTC())
+	if err != nil {
+		return fmt.Errorf("store: put annotation: %w", err)
+	}
+	return nil
+}
+
+// ListAnnotations returns the verdicts matching the filter, newest-first.
+func (s *sqliteStore) ListAnnotations(ctx context.Context, f *AnnotationFilter) ([]*AnnotationRow, error) {
+	var where []string
+	var args []any
+	if f != nil {
+		if f.SessionID != nil {
+			where = append(where, "session_id = ?")
+			args = append(args, f.SessionID.String())
+		}
+		if f.Skill != "" {
+			where = append(where, "skill = ?")
+			args = append(args, f.Skill)
+		}
+		if f.Since != nil {
+			where = append(where, "created_at >= ?")
+			args = append(args, f.Since.UTC())
+		}
+	}
+	q := `SELECT session_id, skill, outcome, note, author, created_at FROM annotations`
+	if len(where) > 0 {
+		q += " WHERE " + strings.Join(where, " AND ")
+	}
+	q += " ORDER BY created_at DESC"
+
+	rows, err := s.db.QueryContext(ctx, q, args...)
+	if err != nil {
+		return nil, fmt.Errorf("store: list annotations: %w", err)
+	}
+	defer rows.Close()
+
+	var out []*AnnotationRow
+	for rows.Next() {
+		a, err := scanAnnotation(rows)
+		if err != nil {
+			return nil, err
+		}
+		out = append(out, a)
+	}
+	return out, rows.Err()
+}
+
+func scanAnnotation(row interface{ Scan(...any) error }) (*AnnotationRow, error) {
+	var (
+		a                   AnnotationRow
+		sid                 string
+		skill, note, author sql.NullString
+		createdAt           time.Time
+	)
+	if err := row.Scan(&sid, &skill, &a.Outcome, &note, &author, &createdAt); err != nil {
+		return nil, fmt.Errorf("store: scan annotation: %w", err)
+	}
+	id, err := uuid.Parse(sid)
+	if err != nil {
+		return nil, fmt.Errorf("store: bad annotation session_id %q: %w", sid, err)
+	}
+	a.SessionID = id
+	a.Skill = skill.String
+	a.Note = note.String
+	a.Author = author.String
+	a.CreatedAt = createdAt.UTC()
+	return &a, nil
+}
diff --git a/internal/ops/store/annotations_test.go b/internal/ops/store/annotations_test.go
new file mode 100644
index 0000000..7cfae24
--- /dev/null
+++ b/internal/ops/store/annotations_test.go
@@ -0,0 +1,104 @@
+package store
+
+import (
+	"context"
+	"testing"
+
+	"github.com/google/uuid"
+)
+
+// TestAnnotations_RoundTrip pins the write/read path: a verdict reads back with
+// its fields, and filters scope by session and skill.
+func TestAnnotations_RoundTrip(t *testing.T) {
+	st := openTestStore(t)
+	ctx := context.Background()
+	sid := uuid.New()
+
+	if err := st.PutAnnotation(ctx, &AnnotationRow{
+		SessionID: sid, Skill: "triage-issue", Outcome: "bad",
+		Note: "ambiguous: needs a setting", Author: "rakshith",
+	}); err != nil {
+		t.Fatalf("put annotation: %v", err)
+	}
+
+	got, err := st.ListAnnotations(ctx, &AnnotationFilter{SessionID: &sid})
+	if err != nil {
+		t.Fatalf("list: %v", err)
+	}
+	if len(got) != 1 {
+		t.Fatalf("want 1 annotation, got %d", len(got))
+	}
+	a := got[0]
+	if a.Skill != "triage-issue" || a.Outcome != "bad" ||
+		a.Note != "ambiguous: needs a setting" || a.Author != "rakshith" {
+		t.Errorf("annotation did not round-trip: %+v", a)
+	}
+	if a.CreatedAt.IsZero() {
+		t.Error("created_at was not stamped")
+	}
+
+	// Skill filter matches the per-skill verdict.
+	bySkill, _ := st.ListAnnotations(ctx, &AnnotationFilter{Skill: "triage-issue"})
+	if len(bySkill) != 1 {
+		t.Errorf("skill filter: want 1, got %d", len(bySkill))
+	}
+	none, _ := st.ListAnnotations(ctx, &AnnotationFilter{Skill: "other"})
+	if len(none) != 0 {
+		t.Errorf("skill filter (other): want 0, got %d", len(none))
+	}
+}
+
+// TestAnnotations_RequiresOutcome pins the validation guard.
+func TestAnnotations_RequiresOutcome(t *testing.T) {
+	st := openTestStore(t)
+	if err := st.PutAnnotation(context.Background(), &AnnotationRow{SessionID: uuid.New()}); err == nil {
+		t.Error("expected an error for a verdict with no outcome")
+	}
+}
+
+// TestAnnotations_SurviveRederive proves the whole point of a separate table:
+// re-deriving a session (which replaces spans + session_meta) does NOT touch
+// the human verdict.
+func TestAnnotations_SurviveRederive(t *testing.T) {
+	st := openTestStore(t)
+	ctx := context.Background()
+	sid := uuid.New()
+
+	if err := st.ReplaceSessionDerivation(ctx, metaFor(sid, "claude-code", 1000, "triage-issue"), nil); err != nil {
+		t.Fatalf("derive: %v", err)
+	}
+	if err := st.PutAnnotation(ctx, &AnnotationRow{SessionID: sid, Skill: "triage-issue", Outcome: "bad"}); err != nil {
+		t.Fatalf("annotate: %v", err)
+	}
+	// Re-derive (the rederive path uses this same write).
+	if err := st.ReplaceSessionDerivation(ctx, metaFor(sid, "claude-code", 1000, "triage-issue"), nil); err != nil {
+		t.Fatalf("re-derive: %v", err)
+	}
+
+	got, _ := st.ListAnnotations(ctx, &AnnotationFilter{SessionID: &sid})
+	if len(got) != 1 {
+		t.Fatalf("annotation did not survive rederive: got %d", len(got))
+	}
+}
+
+// TestAnnotations_ClearedOnDeleteSession proves a hard session purge removes
+// its verdicts (nothing left to annotate).
+func TestAnnotations_ClearedOnDeleteSession(t *testing.T) {
+	st := openTestStore(t)
+	ctx := context.Background()
+	sid := uuid.New()
+
+	if err := st.ReplaceSessionDerivation(ctx, metaFor(sid, "codex", 1000, "tdd"), nil); err != nil {
+		t.Fatalf("derive: %v", err)
+	}
+	if err := st.PutAnnotation(ctx, &AnnotationRow{SessionID: sid, Outcome: "good"}); err != nil {
+		t.Fatalf("annotate: %v", err)
+	}
+	if _, err := st.DeleteSession(ctx, sid); err != nil {
+		t.Fatalf("delete: %v", err)
+	}
+	got, _ := st.ListAnnotations(ctx, &AnnotationFilter{SessionID: &sid})
+	if len(got) != 0 {
+		t.Errorf("annotations survived DeleteSession: %d", len(got))
+	}
+}
diff --git a/internal/ops/store/eval_runs.go b/internal/ops/store/eval_runs.go
new file mode 100644
index 0000000..26da6de
--- /dev/null
+++ b/internal/ops/store/eval_runs.go
@@ -0,0 +1,177 @@
+package store
+
+import (
+	"context"
+	"database/sql"
+	"fmt"
+	"strings"
+	"time"
+)
+
+// EvalRunRow is one recorded eval run, keyed by {skill_name, skill_commit} so it
+// joins to spans/lineage by the exact locked commit that ran. Cases carry the
+// per-case verdicts; they are written with the run and loaded on demand.
+type EvalRunRow struct {
+	ID          int64         `json:"id"`
+	SkillName   string        `json:"skill_name"`
+	SkillCommit string        `json:"skill_commit"`
+	Suite       string        `json:"suite"`
+	SessionID   string        `json:"session_id,omitempty"`
+	StartedAt   time.Time     `json:"started_at"`
+	Passed      int           `json:"passed"`
+	Failed      int           `json:"failed"`
+	Pass        bool          `json:"pass"`
+	Cases       []EvalCaseRow `json:"cases,omitempty"`
+}
+
+// EvalCaseRow is one case's verdict within a run.
+type EvalCaseRow struct {
+	Suite  string `json:"suite"`
+	Case   string `json:"case"`
+	Pass   bool   `json:"pass"`
+	Detail string `json:"detail,omitempty"`
+}
+
+// EvalRunFilter scopes ListEvalRuns. Nil/zero fields are ignored; results are
+// newest-first by started_at.
+type EvalRunFilter struct {
+	SkillName   string
+	SkillCommit string
+	Since       *time.Time
+	Limit       int
+}
+
+// PutEvalRun inserts a run and its case rows in one tx, returning the new run id.
+func (s *sqliteStore) PutEvalRun(ctx context.Context, r *EvalRunRow) (int64, error) {
+	if r == nil {
+		return 0, fmt.Errorf("store: nil eval run")
+	}
+	if r.StartedAt.IsZero() {
+		r.StartedAt = time.Now().UTC()
+	}
+	tx, err := s.db.BeginTx(ctx, nil)
+	if err != nil {
+		return 0, fmt.Errorf("store: eval run tx: %w", err)
+	}
+	defer func() { _ = tx.Rollback() }()
+
+	res, err := tx.ExecContext(ctx,
+		`INSERT INTO eval_runs(skill_name, skill_commit, suite, session_id, started_at, passed, failed, pass)
+		 VALUES (?,?,?,?,?,?,?,?)`,
+		r.SkillName, r.SkillCommit, r.Suite, nullableString(r.SessionID),
+		r.StartedAt.UTC(), r.Passed, r.Failed, boolToInt(r.Pass))
+	if err != nil {
+		return 0, fmt.Errorf("store: insert eval run: %w", err)
+	}
+	id, _ := res.LastInsertId()
+	for _, c := range r.Cases {
+		if _, err := tx.ExecContext(ctx,
+			`INSERT INTO eval_case_results(eval_run_id, suite, case_name, pass, detail)
+			 VALUES (?,?,?,?,?)`,
+			id, c.Suite, c.Case, boolToInt(c.Pass), nullableString(c.Detail)); err != nil {
+			return 0, fmt.Errorf("store: insert eval case: %w", err)
+		}
+	}
+	if err := tx.Commit(); err != nil {
+		return 0, fmt.Errorf("store: eval run commit: %w", err)
+	}
+	r.ID = id
+	return id, nil
+}
+
+// ListEvalRuns returns runs matching the filter, newest-first, each with its
+// case rows loaded.
+func (s *sqliteStore) ListEvalRuns(ctx context.Context, f *EvalRunFilter) ([]*EvalRunRow, error) {
+	var where []string
+	var args []any
+	if f != nil {
+		if f.SkillName != "" {
+			where = append(where, "skill_name = ?")
+			args = append(args, f.SkillName)
+		}
+		if f.SkillCommit != "" {
+			where = append(where, "skill_commit = ?")
+			args = append(args, f.SkillCommit)
+		}
+		if f.Since != nil {
+			where = append(where, "started_at >= ?")
+			args = append(args, f.Since.UTC())
+		}
+	}
+	q := `SELECT id, skill_name, skill_commit, suite, session_id, started_at, passed, failed, pass FROM eval_runs`
+	if len(where) > 0 {
+		q += " WHERE " + strings.Join(where, " AND ")
+	}
+	q += " ORDER BY started_at DESC"
+	if f != nil && f.Limit > 0 {
+		q += " LIMIT ?"
+		args = append(args, f.Limit)
+	}
+
+	rows, err := s.db.QueryContext(ctx, q, args...)
+	if err != nil {
+		return nil, fmt.Errorf("store: list eval runs: %w", err)
+	}
+	defer rows.Close()
+
+	var out []*EvalRunRow
+	for rows.Next() {
+		r, err := scanEvalRun(rows)
+		if err != nil {
+			return nil, err
+		}
+		out = append(out, r)
+	}
+	if err := rows.Err(); err != nil {
+		return nil, err
+	}
+	for _, r := range out {
+		cases, err := s.evalCases(ctx, r.ID)
+		if err != nil {
+			return nil, err
+		}
+		r.Cases = cases
+	}
+	return out, nil
+}
+
+func (s *sqliteStore) evalCases(ctx context.Context, runID int64) ([]EvalCaseRow, error) {
+	rows, err := s.db.QueryContext(ctx,
+		`SELECT suite, case_name, pass, detail FROM eval_case_results WHERE eval_run_id = ?`, runID)
+	if err != nil {
+		return nil, fmt.Errorf("store: list eval cases: %w", err)
+	}
+	defer rows.Close()
+	var out []EvalCaseRow
+	for rows.Next() {
+		var (
+			c      EvalCaseRow
+			p      int
+			detail sql.NullString
+		)
+		if err := rows.Scan(&c.Suite, &c.Case, &p, &detail); err != nil {
+			return nil, fmt.Errorf("store: scan eval case: %w", err)
+		}
+		c.Pass = p != 0
+		c.Detail = detail.String
+		out = append(out, c)
+	}
+	return out, rows.Err()
+}
+
+func scanEvalRun(row interface{ Scan(...any) error }) (*EvalRunRow, error) {
+	var (
+		r         EvalRunRow
+		sessionID sql.NullString
+		passInt   int
+		startedAt time.Time
+	)
+	if err := row.Scan(&r.ID, &r.SkillName, &r.SkillCommit, &r.Suite, &sessionID,
+		&startedAt, &r.Passed, &r.Failed, &passInt); err != nil {
+		return nil, fmt.Errorf("store: scan eval run: %w", err)
+	}
+	r.SessionID = sessionID.String
+	r.StartedAt = startedAt.UTC()
+	r.Pass = passInt != 0
+	return &r, nil
+}
diff --git a/internal/ops/store/eval_runs_test.go b/internal/ops/store/eval_runs_test.go
new file mode 100644
index 0000000..75c63a0
--- /dev/null
+++ b/internal/ops/store/eval_runs_test.go
@@ -0,0 +1,66 @@
+package store
+
+import (
+	"context"
+	"testing"
+)
+
+// TestEvalRuns_RoundTrip pins the write/read path: a run and its case rows land
+// atomically, read back newest-first, and filter by skill commit.
+func TestEvalRuns_RoundTrip(t *testing.T) {
+	st := openTestStore(t)
+	ctx := context.Background()
+
+	// Two runs of the same skill at different commits: the older fails, the
+	// newer (post-fix) passes — the article's fail→pass.
+	if _, err := st.PutEvalRun(ctx, &EvalRunRow{
+		SkillName: "triage-issue", SkillCommit: "aaaaaaa", Suite: "triage-correctness",
+		Passed: 0, Failed: 1, Pass: false,
+		Cases: []EvalCaseRow{{Suite: "triage-correctness", Case: "needs-info", Pass: false, Detail: "text: missing needs-info"}},
+	}); err != nil {
+		t.Fatalf("put failing run: %v", err)
+	}
+	id2, err := st.PutEvalRun(ctx, &EvalRunRow{
+		SkillName: "triage-issue", SkillCommit: "bbbbbbb", Suite: "triage-correctness",
+		Passed: 1, Failed: 0, Pass: true,
+		Cases: []EvalCaseRow{{Suite: "triage-correctness", Case: "needs-info", Pass: true}},
+	})
+	if err != nil {
+		t.Fatalf("put passing run: %v", err)
+	}
+	if id2 == 0 {
+		t.Fatal("expected a non-zero run id")
+	}
+
+	all, err := st.ListEvalRuns(ctx, &EvalRunFilter{SkillName: "triage-issue"})
+	if err != nil {
+		t.Fatalf("list: %v", err)
+	}
+	if len(all) != 2 {
+		t.Fatalf("want 2 runs, got %d", len(all))
+	}
+	// Newest-first: the passing run leads.
+	if !all[0].Pass || all[0].SkillCommit != "bbbbbbb" {
+		t.Errorf("newest run wrong: %+v", all[0])
+	}
+	if len(all[0].Cases) != 1 || !all[0].Cases[0].Pass {
+		t.Errorf("case rows did not round-trip: %+v", all[0].Cases)
+	}
+	if len(all[1].Cases) != 1 || all[1].Cases[0].Detail != "text: missing needs-info" {
+		t.Errorf("failing case detail did not round-trip: %+v", all[1].Cases)
+	}
+
+	// Commit filter isolates the passing run.
+	byCommit, _ := st.ListEvalRuns(ctx, &EvalRunFilter{SkillName: "triage-issue", SkillCommit: "bbbbbbb"})
+	if len(byCommit) != 1 || !byCommit[0].Pass {
+		t.Errorf("commit filter: want 1 passing, got %+v", byCommit)
+	}
+}
+
+// TestEvalRuns_RequiresRow pins the nil guard.
+func TestEvalRuns_RequiresRow(t *testing.T) {
+	st := openTestStore(t)
+	if _, err := st.PutEvalRun(context.Background(), nil); err == nil {
+		t.Error("expected an error for a nil eval run")
+	}
+}
diff --git a/internal/ops/store/migrations/0007_session_meta_outcome.sql b/internal/ops/store/migrations/0007_session_meta_outcome.sql
new file mode 100644
index 0000000..8c9062c
--- /dev/null
+++ b/internal/ops/store/migrations/0007_session_meta_outcome.sql
@@ -0,0 +1,6 @@
+-- 0007: nullable session-level outcome on the unified session read model.
+-- The worst-of-spans rollup of each session's TOOL/SKILL qvr.outcome
+-- attribute (success/failure/blocked), derived at derive time. NULL means no
+-- span carried an outcome — consumers render "unknown", never a fabricated
+-- success. Regenerated by `qvr audit rederive` after the v8 deriver bump.
+ALTER TABLE session_meta ADD COLUMN outcome TEXT;
diff --git a/internal/ops/store/migrations/0008_annotations.sql b/internal/ops/store/migrations/0008_annotations.sql
new file mode 100644
index 0000000..e35f90b
--- /dev/null
+++ b/internal/ops/store/migrations/0008_annotations.sql
@@ -0,0 +1,19 @@
+-- 0008: human annotations on captured sessions — the outer-loop feedback
+-- signal (a reviewer's verdict on what a skill actually did, with a reason).
+-- Unlike spans and session_meta, an annotation is NOT a derived projection:
+-- `qvr audit rederive` regenerates those from raw, which would erase a human
+-- edit written there. The verdict therefore lives in its own table that
+-- derivation never rewrites. A NULL skill is a whole-session annotation; a set
+-- skill scopes the verdict to one skill within the session. Append-only: a
+-- session can accrue several verdicts over time (the latest by created_at wins
+-- for consumers that want one).
+CREATE TABLE IF NOT EXISTS annotations (
+  session_id  TEXT NOT NULL,
+  skill       TEXT,
+  outcome     TEXT NOT NULL,
+  note        TEXT,
+  author      TEXT,
+  created_at  DATETIME NOT NULL
+);
+CREATE INDEX IF NOT EXISTS idx_annotations_session ON annotations(session_id);
+CREATE INDEX IF NOT EXISTS idx_annotations_skill ON annotations(skill);
diff --git a/internal/ops/store/migrations/0009_eval_runs.sql b/internal/ops/store/migrations/0009_eval_runs.sql
new file mode 100644
index 0000000..340ea8c
--- /dev/null
+++ b/internal/ops/store/migrations/0009_eval_runs.sql
@@ -0,0 +1,27 @@
+-- 0009: eval results, keyed by {skill_name, skill_commit} — the lock × evidence
+-- join that is qvr's moat. An eval run grades a captured session against the
+-- skill's evals.yaml; the verdict is pinned to the exact locked commit that
+-- ran, so lineage can show "at commit abc the safety suite failed; at def it
+-- passed". Per-case rows carry the granular pass/fail + reason.
+CREATE TABLE IF NOT EXISTS eval_runs (
+  id            INTEGER PRIMARY KEY AUTOINCREMENT,
+  skill_name    TEXT NOT NULL,
+  skill_commit  TEXT NOT NULL,
+  suite         TEXT NOT NULL,         -- requested suite, or '*' for all
+  session_id    TEXT,                  -- the graded session
+  started_at    DATETIME NOT NULL,
+  passed        INTEGER NOT NULL,
+  failed        INTEGER NOT NULL,
+  pass          INTEGER NOT NULL       -- 1 only when every case passed
+);
+CREATE INDEX IF NOT EXISTS idx_eval_runs_skill ON eval_runs(skill_name, skill_commit);
+CREATE INDEX IF NOT EXISTS idx_eval_runs_started ON eval_runs(started_at);
+
+CREATE TABLE IF NOT EXISTS eval_case_results (
+  eval_run_id   INTEGER NOT NULL,
+  suite         TEXT NOT NULL,
+  case_name     TEXT NOT NULL,
+  pass          INTEGER NOT NULL,
+  detail        TEXT
+);
+CREATE INDEX IF NOT EXISTS idx_eval_case_run ON eval_case_results(eval_run_id);
diff --git a/internal/ops/store/scan.go b/internal/ops/store/scan.go
index 907e025..6853183 100644
--- a/internal/ops/store/scan.go
+++ b/internal/ops/store/scan.go
@@ -28,3 +28,11 @@ func nullInt64Ptr(v sql.NullInt64) *int64 {
 	n := v.Int64
 	return &n
 }
+
+// boolToInt maps a Go bool to SQLite's 0/1 integer encoding.
+func boolToInt(b bool) int {
+	if b {
+		return 1
+	}
+	return 0
+}
diff --git a/internal/ops/store/session_meta.go b/internal/ops/store/session_meta.go
index c1d7f91..fc4726f 100644
--- a/internal/ops/store/session_meta.go
+++ b/internal/ops/store/session_meta.go
@@ -32,8 +32,12 @@ type SessionMetaRow struct {
 	Skills          []string  `json:"skills,omitempty"` // distinct skill names, first-use order
 	// Session token totals. nil = the native store reported no usage on that
 	// side (serialized as absent, rendered n/a — never 0).
-	TokensIn       *int64    `json:"tokens_in,omitempty"`
-	TokensOut      *int64    `json:"tokens_out,omitempty"`
+	TokensIn  *int64 `json:"tokens_in,omitempty"`
+	TokensOut *int64 `json:"tokens_out,omitempty"`
+	// Outcome is the session-level rollup of its TOOL/SKILL spans' qvr.outcome
+	// (worst-of: failure > blocked > success). "" = no span carried an outcome
+	// (serialized as absent, rendered "unknown" — never success).
+	Outcome        string    `json:"outcome,omitempty"`
 	DeriverVersion int       `json:"deriver_version"`
 	DerivedAt      time.Time `json:"derived_at"`
 }
@@ -70,25 +74,37 @@ type SessionMetaFilter struct {
 
 const sessionMetaColumns = `session_id, agent_name, source_session_id, source_path,
   working_directory, git_branch, model, title, started_ms, ended_ms,
-  turns, tools, skills, tokens_in, tokens_out, deriver_version, derived_at`
+  turns, tools, skills, tokens_in, tokens_out, outcome, deriver_version, derived_at`
 
-// sessionMetaColumnsLegacy reads a DB that predates migration 0006 (opened
-// read-only by `qvr ui --no-discover`, so the migration can't apply): the
-// token columns come back as NULL (n/a) instead of failing the whole list.
+// sessionMetaColumnsNoOutcome reads a DB at schema 0006 (tokens present, no
+// outcome yet — e.g. opened read-only before migration 0007 can apply): the
+// outcome column comes back NULL (unknown) instead of failing the whole list.
+const sessionMetaColumnsNoOutcome = `session_id, agent_name, source_session_id, source_path,
+  working_directory, git_branch, model, title, started_ms, ended_ms,
+  turns, tools, skills, tokens_in, tokens_out, NULL, deriver_version, derived_at`
+
+// sessionMetaColumnsLegacy reads a DB that predates migration 0006: both the
+// token columns and outcome come back NULL (n/a) instead of failing the list.
 const sessionMetaColumnsLegacy = `session_id, agent_name, source_session_id, source_path,
   working_directory, git_branch, model, title, started_ms, ended_ms,
-  turns, tools, skills, NULL, NULL, deriver_version, derived_at`
+  turns, tools, skills, NULL, NULL, NULL, deriver_version, derived_at`
 
 // staleTokensSchema reports the pre-0006 "tokens columns missing" condition.
 func staleTokensSchema(err error) bool {
 	return err != nil && strings.Contains(err.Error(), "no such column: tokens_in")
 }
 
+// staleOutcomeSchema reports the pre-0007 "outcome column missing" condition (a
+// DB already at 0006, so tokens resolve but outcome doesn't).
+func staleOutcomeSchema(err error) bool {
+	return err != nil && strings.Contains(err.Error(), "no such column: outcome")
+}
+
 const upsertSessionMetaSQL = `INSERT OR REPLACE INTO session_meta(
   session_id, agent_name, source_session_id, source_path, working_directory,
   git_branch, model, title, started_ms, ended_ms, turns, tools, skills,
-  tokens_in, tokens_out, deriver_version, derived_at
-) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)`
+  tokens_in, tokens_out, outcome, deriver_version, derived_at
+) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)`
 
 // ReplaceSessionDerivation atomically replaces a session's whole derived
 // projection — its session_meta row and all its spans — in one tx. This is the
@@ -125,7 +141,7 @@ func (s *sqliteStore) ReplaceSessionDerivation(ctx context.Context, meta *Sessio
 		nullableString(meta.Model), nullableString(meta.Title),
 		meta.StartedMs, meta.EndedMs, meta.Turns, meta.Tools, skills,
 		nullableInt64(meta.TokensIn), nullableInt64(meta.TokensOut),
-		meta.DeriverVersion, meta.DerivedAt.UTC(),
+		nullableString(meta.Outcome), meta.DeriverVersion, meta.DerivedAt.UTC(),
 	); err != nil {
 		return fmt.Errorf("store: upsert session meta: %w", err)
 	}
@@ -135,26 +151,27 @@ func (s *sqliteStore) ReplaceSessionDerivation(ctx context.Context, meta *Sessio
 // ListSessionMeta returns unified session rows newest-first (by start time),
 // or by total tokens when the filter asks for it.
 func (s *sqliteStore) ListSessionMeta(ctx context.Context, f *SessionMetaFilter) ([]*SessionMetaRow, error) {
-	out, err := s.listSessionMeta(ctx, f, false)
+	// Cascade down the schema as columns go missing on read-only opens that
+	// predate a migration: full → no-outcome (0006) → legacy (pre-0006).
+	out, err := s.listSessionMeta(ctx, f, sessionMetaColumns, true)
+	if staleOutcomeSchema(err) {
+		out, err = s.listSessionMeta(ctx, f, sessionMetaColumnsNoOutcome, true)
+	}
 	if staleTokensSchema(err) {
-		return s.listSessionMeta(ctx, f, true)
+		out, err = s.listSessionMeta(ctx, f, sessionMetaColumnsLegacy, false)
 	}
 	return out, err
 }
 
-func (s *sqliteStore) listSessionMeta(ctx context.Context, f *SessionMetaFilter, legacy bool) ([]*SessionMetaRow, error) {
+func (s *sqliteStore) listSessionMeta(ctx context.Context, f *SessionMetaFilter, cols string, tokenSort bool) ([]*SessionMetaRow, error) {
 	where, args := sessionMetaWhere(f)
-	cols := sessionMetaColumns
-	if legacy {
-		cols = sessionMetaColumnsLegacy
-	}
 	q := `SELECT ` + cols + ` FROM session_meta`
 	if len(where) > 0 {
 		q += " WHERE " + strings.Join(where, " AND ")
 	}
 	// On a legacy schema a token sort degrades to the default order — every
 	// row's tokens read n/a, so there is no token order to honor.
-	if f != nil && f.SortByTokens && !legacy {
+	if f != nil && f.SortByTokens && tokenSort {
 		q += ` ORDER BY (tokens_in IS NULL AND tokens_out IS NULL),
 		  (COALESCE(tokens_in,0)+COALESCE(tokens_out,0)) DESC, started_ms DESC`
 	} else {
@@ -185,6 +202,9 @@ func (s *sqliteStore) listSessionMeta(ctx context.Context, f *SessionMetaFilter,
 // GetSessionMeta returns one session's unified row, or nil when absent.
 func (s *sqliteStore) GetSessionMeta(ctx context.Context, sessionID uuid.UUID) (*SessionMetaRow, error) {
 	m, err := s.getSessionMeta(ctx, sessionID, sessionMetaColumns)
+	if staleOutcomeSchema(err) {
+		m, err = s.getSessionMeta(ctx, sessionID, sessionMetaColumnsNoOutcome)
+	}
 	if staleTokensSchema(err) {
 		return s.getSessionMeta(ctx, sessionID, sessionMetaColumnsLegacy)
 	}
@@ -248,16 +268,18 @@ func scanSessionMeta(row interface{ Scan(...any) error }) (*SessionMetaRow, erro
 		srcSession, srcPath, wd        sql.NullString
 		branch, model, title, skillsJS sql.NullString
 		tokensIn, tokensOut            sql.NullInt64
+		outcome                        sql.NullString
 		derivedAt                      time.Time
 	)
 	if err := row.Scan(&sid, &m.AgentName, &srcSession, &srcPath, &wd,
 		&branch, &model, &title, &m.StartedMs, &m.EndedMs,
-		&m.Turns, &m.Tools, &skillsJS, &tokensIn, &tokensOut,
+		&m.Turns, &m.Tools, &skillsJS, &tokensIn, &tokensOut, &outcome,
 		&m.DeriverVersion, &derivedAt); err != nil {
 		return nil, fmt.Errorf("store: scan session meta: %w", err)
 	}
 	m.TokensIn = nullInt64Ptr(tokensIn)
 	m.TokensOut = nullInt64Ptr(tokensOut)
+	m.Outcome = outcome.String
 	id, err := uuid.Parse(sid)
 	if err != nil {
 		return nil, fmt.Errorf("store: bad session_meta session_id %q: %w", sid, err)
diff --git a/internal/ops/store/session_meta_test.go b/internal/ops/store/session_meta_test.go
index 7126ea5..b69649d 100644
--- a/internal/ops/store/session_meta_test.go
+++ b/internal/ops/store/session_meta_test.go
@@ -205,6 +205,35 @@ func i64Equal(a, b *int64) bool {
 	return *a == *b
 }
 
+// TestSessionMetaOutcome_RoundTrip pins the nullable outcome column: a set
+// verdict reads back, and "" (no span carried an outcome) reads back as "".
+func TestSessionMetaOutcome_RoundTrip(t *testing.T) {
+	st := openTestStore(t)
+	ctx := context.Background()
+
+	withOutcome, unknown := uuid.New(), uuid.New()
+	o := metaFor(withOutcome, "claude-code", 1000, "code-review")
+	o.Outcome = "failure"
+	if err := st.ReplaceSessionDerivation(ctx, o, nil); err != nil {
+		t.Fatalf("derive with outcome: %v", err)
+	}
+	if err := st.ReplaceSessionDerivation(ctx, metaFor(unknown, "codex", 2000, "tdd"), nil); err != nil {
+		t.Fatalf("derive without outcome: %v", err)
+	}
+
+	got, err := st.GetSessionMeta(ctx, withOutcome)
+	if err != nil || got == nil {
+		t.Fatalf("get: %v %v", got, err)
+	}
+	if got.Outcome != "failure" {
+		t.Errorf("outcome = %q, want failure", got.Outcome)
+	}
+	got2, _ := st.GetSessionMeta(ctx, unknown)
+	if got2 == nil || got2.Outcome != "" {
+		t.Errorf("outcome = %q, want \"\" (unknown)", got2.Outcome)
+	}
+}
+
 // TestGetSessionMeta_AbsentIsNil pins the no-row contract: nil, not an error.
 func TestGetSessionMeta_AbsentIsNil(t *testing.T) {
 	st := openTestStore(t)
diff --git a/internal/ops/store/sqlite.go b/internal/ops/store/sqlite.go
index e919aba..77fa03f 100644
--- a/internal/ops/store/sqlite.go
+++ b/internal/ops/store/sqlite.go
@@ -145,6 +145,12 @@ func (s *sqliteStore) DeleteSession(ctx context.Context, sessionID uuid.UUID) (i
 	if _, err := tx.ExecContext(ctx, `DELETE FROM trace_cursors WHERE session_id = ?`, id); err != nil {
 		return 0, fmt.Errorf("store: delete session cursor: %w", err)
 	}
+	// Annotations are human verdicts on this session; a hard session delete
+	// removes them too (nothing left to annotate). A rederive does NOT reach
+	// here, so verdicts survive re-derivation — only a full purge clears them.
+	if _, err := tx.ExecContext(ctx, `DELETE FROM annotations WHERE session_id = ?`, id); err != nil {
+		return 0, fmt.Errorf("store: delete session annotations: %w", err)
+	}
 	if err := tx.Commit(); err != nil {
 		return 0, fmt.Errorf("store: delete session commit: %w", err)
 	}
diff --git a/internal/ops/store/store.go b/internal/ops/store/store.go
index ceea15c..2f5b741 100644
--- a/internal/ops/store/store.go
+++ b/internal/ops/store/store.go
@@ -129,6 +129,20 @@ type Store interface {
 	PutLockSnapshots(ctx context.Context, sessionID uuid.UUID, rows []*LockSnapshotRow) error
 	GetLockSnapshots(ctx context.Context, sessionID uuid.UUID) (map[string]*LockSnapshotRow, error)
 
+	// PutAnnotation appends a human verdict on a session (the outer-loop
+	// feedback signal); ListAnnotations reads them back, newest-first. Unlike
+	// spans/session_meta, annotations are NOT regenerated by rederive — they are
+	// durable human input. See annotations.go + migration 0008.
+	PutAnnotation(ctx context.Context, a *AnnotationRow) error
+	ListAnnotations(ctx context.Context, f *AnnotationFilter) ([]*AnnotationRow, error)
+
+	// PutEvalRun records one eval run (verdict + per-case rows), keyed by
+	// {skill_name, skill_commit} — the lock × evidence join. ListEvalRuns reads
+	// them back newest-first for the gate and for lineage. See eval_runs.go +
+	// migration 0009.
+	PutEvalRun(ctx context.Context, r *EvalRunRow) (int64, error)
+	ListEvalRuns(ctx context.Context, f *EvalRunFilter) ([]*EvalRunRow, error)
+
 	// DeleteRawBefore sweeps raw rows captured before cutoff. Returns the
 	// number of rows deleted.
 	DeleteRawBefore(ctx context.Context, cutoff time.Time) (int64, error)

From 9b007fda7e1c37c2edbfb18a87e7538abff730f0 Mon Sep 17 00:00:00 2001
From: Rakshith <sairakshith977@gmail.com>
Date: Tue, 16 Jun 2026 23:41:18 -0400
Subject: [PATCH 02/10] =?UTF-8?q?feat(skills):=20evolve-skill=20=E2=80=94?=
 =?UTF-8?q?=20the=20self-improvement=20orchestrator?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A qvr-installable skill that runs the outer loop over the CLI: observe
(qvr audit) -> edit (qvr edit) -> re-run -> gate (qvr ops eval run) ->
open a PR. Evidence-gated; never auto-merges or auto-installs. Ships a CI
recipe for running the loop unattended.
---
 skills/evolve-skill/SKILL.md                | 183 ++++++++++++++++++++
 skills/evolve-skill/references/ci-recipe.md |  85 +++++++++
 2 files changed, 268 insertions(+)
 create mode 100644 skills/evolve-skill/SKILL.md
 create mode 100644 skills/evolve-skill/references/ci-recipe.md

diff --git a/skills/evolve-skill/SKILL.md b/skills/evolve-skill/SKILL.md
new file mode 100644
index 0000000..96a9810
--- /dev/null
+++ b/skills/evolve-skill/SKILL.md
@@ -0,0 +1,183 @@
+---
+name: evolve-skill
+description: >
+  Runs a self-improvement loop on an installed skill: read what it actually did
+  (auto outcomes + human verdicts), draft a fix to its SKILL.md, and keep the fix
+  only if qvr's eval gate shows the score improve. Use when a user wants a skill
+  to get better from feedback over time — e.g. "improve my triage skill from the
+  feedback", "evolve this skill", "close the loop on skill quality", "auto-tune a
+  skill against its evals", or "why did this skill regress and how do I fix it".
+  Drives qvr audit + qvr ops; never auto-installs or auto-merges — it opens a PR
+  for a human to merge. Experimental.
+metadata:
+  author: quiver-playground
+  version: "1.0.0"
+---
+
+# Evolve a skill with qvr's eval gate
+
+This is the **outer loop** of skill self-improvement, run as an agent over qvr's
+CLI. The **inner loop** already happened: agents used the skill, and qvr captured
+every run — lock-verified to the exact installed commit — via `qvr audit`. Your
+job here is to read those runs plus any human verdicts, draft an improvement to
+the skill's `SKILL.md`, and **prove it helps before it ships** by re-running the
+skill and grading it with `qvr ops eval`. A candidate that doesn't improve the
+eval score is discarded; one that does is opened as a pull request for a human to
+merge.
+
+The discipline is deliberate: qvr is **evidence-gated**. Skills do not auto-evolve
+from raw model confidence — every change passes a deterministic eval keyed to the
+skill's locked commit, because curated-with-evidence beats blind self-generation.
+
+## When to use this
+
+- A skill has been running for a while and the user wants it to improve from how
+  it actually performed (failures, human corrections), not from a guess.
+- The user has recorded verdicts with `qvr audit annotate` and wants them acted on.
+- A skill regressed after an edit and the user wants the loop to find and fix it.
+
+This **changes a skill's content** behind an eval gate. It is not observability
+(that's `trace-skill-activity`), supply-chain verification
+(`verify-skill-supply-chain`), or install management (`onboard-skills`).
+
+## Prerequisites
+
+- `qvr audit` capturing the target skill's runs (`qvr audit enable` + `discover`;
+  see `trace-skill-activity`).
+- An **`evals.yaml`** beside the skill's `SKILL.md` — qvr's own manifest of
+  suites/cases graded deterministically over the captured trace (no model calls).
+  If the skill has none, write one first: encode the behaviors that matter,
+  starting with any run a human flagged as wrong (a regression case).
+
+A minimal `evals.yaml`:
+
+```yaml
+version: 1
+suites:
+  - name: triage-correctness
+    cases:
+      - name: ambiguous-feature-needs-info
+        graders:
+          - type: outcome          # the captured run did not error
+            expect: success
+          - type: text             # and produced the right label
+            on: final_message
+            contains: ["needs-info"]
+            reject: ["ready-to-implement"]
+```
+
+Grader types available (all deterministic, all over the trace qvr already has):
+`outcome`, `text`, `tool_sequence`, `tool_constraint`, `skill_invocation`,
+`behavior`. Semantic ("is this explanation clear?") judgement is **not** a core
+grader — delegate that to a judge skill and fold its verdict back in as an
+annotation; keep the gate itself model-free.
+
+## The loop
+
+### 1. Observe — read how the skill actually did
+
+```
+qvr audit logs --kind SKILL --since 30d        # where the skill fired
+qvr audit sessions --skill <name>              # its runs, with auto outcomes
+qvr audit annotations --skill <name>           # human verdicts ("bad: …, because …")
+qvr audit export --session <id> -o run.jsonl   # one run verbatim, for close reading
+```
+
+Auto **outcome** (`success`/`failure`/`blocked`) tells you *that* a run went
+wrong; a human **annotation** tells you *why*. Cluster the failures + notes into
+one concrete weakness — the smallest change that would have fixed the flagged
+runs. Record a verdict yourself if one is missing:
+
+```
+qvr audit annotate <session-id> --skill <name> --outcome bad --note "why it was wrong"
+```
+
+### 2. Update — draft the fix
+
+Eject a writable copy and edit it:
+
+```
+qvr edit <name>                 # promote the symlinked skill to a real dir
+# …edit <dir>/SKILL.md to address the weakness…
+qvr diff <name>                 # review the change
+```
+
+Make the **smallest** change that addresses the observed failures. Add the
+flagged input as a regression case in `evals.yaml` if it isn't already one.
+
+### 3. Roll out the candidate — generate fresh evidence
+
+Run the edited skill on the held-out / regression inputs (this is a real agent
+run — execution stays with the agent, not qvr), then let qvr capture it:
+
+```
+# …invoke the skill on the regression input(s)…
+qvr audit discover --agent <agent>   # capture the fresh run(s)
+```
+
+### 4. Evaluate — the gate
+
+```
+qvr ops eval run <name> --suite <suite>
+```
+
+This grades the skill's most recent captured run against the suite, **keyed to
+the locked commit**, and exits non-zero if any case fails. **Keep the edit only
+if the score strictly improved** — i.e. cases that failed before now pass. If it
+did not improve, discard the change (`git checkout` the ejected dir or revert)
+and go back to step 1 with what you learned. Inspect the trail any time:
+
+```
+qvr ops lineage <name>          # eval verdicts + annotations over commits
+```
+
+### 5. Ship — open a PR, never merge
+
+Once the gate passes, publish/commit the change on a branch and **open a pull
+request for a human to review and merge**. Do **not** merge it yourself, and do
+**not** auto-install the new version — a passing eval clears it for review, not
+for an automatic landing. Summarize: the weakness observed, the change made, and
+the eval score before → after.
+
+```
+qvr ops promote <name>          # confirms the locked commit has a passing eval
+```
+
+`qvr ops promote` is the gate made explicit: it refuses (non-zero exit) unless
+the locked commit is backed by a passing eval, so a CI step or the loop can
+branch on it.
+
+## Running it unattended
+
+The same loop runs on a schedule via CI — a scheduled job that proposes a PR,
+plus a PR check that fails when `qvr ops eval run` regresses. See
+[`references/ci-recipe.md`](references/ci-recipe.md). The schedule and the agent
+runner live outside qvr (qvr is the substrate, not a scheduler); qvr provides the
+capture, the gate, and the lineage.
+
+## Gotchas
+
+- **Evidence-gated, always.** A change ships only behind a passing eval keyed to
+  its commit. No eval improvement ⇒ no change. Resist "the model is confident."
+- **Never auto-merge, never auto-install.** Open a PR; a human merges. A passing
+  eval is a green light to *review*, not to land.
+- **Annotations are the "why."** Auto outcome alone can't tell a correct-but-
+  unhelpful run from a good one; the human verdict carries the intent the loop
+  optimizes toward.
+- **Keep the gate model-free.** Deterministic graders over the real trace are the
+  core; push LLM judgement to a separate judge skill so the gate stays
+  reproducible and cheap.
+- **Experimental.** `qvr ops` / `qvr audit` command names and the `evals.yaml`
+  schema may change; pin your qvr version if you script the loop.
+
+## Troubleshooting
+
+- *`skill … has no evals.yaml`* — write one beside `SKILL.md` (see Prerequisites).
+- *`no captured sessions for skill … to evaluate`* — run the skill, then
+  `qvr audit discover`, before `qvr ops eval run`.
+- *The eval passes but the skill still feels wrong* — your suite is under-
+  specifying the behavior. Add the failing case; the loop is only as good as its
+  evals.
+- *Lineage shows no commit move* — the edited skill wasn't re-committed, so the
+  eval is keyed to the same commit. Commit the change (or `qvr edit`/`publish`)
+  so the gate distinguishes before from after.
diff --git a/skills/evolve-skill/references/ci-recipe.md b/skills/evolve-skill/references/ci-recipe.md
new file mode 100644
index 0000000..3dd69bc
--- /dev/null
+++ b/skills/evolve-skill/references/ci-recipe.md
@@ -0,0 +1,85 @@
+# CI recipe: the outer loop on a schedule
+
+qvr is the substrate — capture, the eval gate, and lineage — not a scheduler or
+an agent runner. To run the self-improvement loop unattended you wire two CI
+jobs around qvr's CLI. Both are illustrative GitHub Actions; adapt to any
+scheduler + cloud-agent platform.
+
+## Job A — PR regression gate (run on every pull request)
+
+Fails the PR when a changed skill's evals regress. This is the cheap, always-on
+half: it never edits anything, it just runs the deterministic gate.
+
+```yaml
+name: skill-evals
+on: pull_request
+jobs:
+  eval-gate:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - run: curl -fsSL https://quiver.sh/install | sh    # install qvr
+      - run: qvr audit enable && qvr audit discover
+      # Grade each skill that ships an evals.yaml; non-zero exit fails the job.
+      - run: |
+          for s in $(qvr ls --output json | jq -r '.[].name'); do
+            qvr ops eval run "$s" || exit 1
+          done
+```
+
+The gate grades the skill's most recent **captured** run — so the PR's CI must
+first exercise the skill (your existing test/integration step), then
+`qvr audit discover` captures it, then `qvr ops eval run` judges it. No agent is
+run by qvr itself.
+
+## Job B — scheduled improver (run on a cron)
+
+Proposes improvements. This half needs an **agent** (a cloud coding agent / the
+`evolve-skill` loop) and a **scheduler** — both outside qvr. The agent drives the
+exact CLI loop from `SKILL.md`: observe → update → roll out → evaluate → open a
+PR.
+
+```yaml
+name: evolve-skills
+on:
+  schedule:
+    - cron: "0 6 * * *"      # daily; tune to how fast feedback accrues
+jobs:
+  evolve:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - run: curl -fsSL https://quiver.sh/install | sh
+      - run: qvr audit enable && qvr audit discover
+      # Hand the loop to your agent runner. It reads `qvr audit annotations` +
+      # `qvr audit export`, edits the skill behind `qvr edit`, re-runs it,
+      # gates on `qvr ops eval run`, and opens a PR. It MUST NOT merge.
+      - run: your-agent-runner run --skill evolve-skill --target "$SKILL"
+        env:
+          SKILL: triage-issue
+```
+
+## Rules the scheduled job must honor
+
+- **Open a PR; never merge.** A passing eval clears a change for human review,
+  not for an automatic landing. The job's output is a PR, full stop.
+- **Never auto-install.** Do not `qvr add` the candidate into anyone's project as
+  part of the loop.
+- **Gate every change.** No PR without a `qvr ops eval run` pass recorded for the
+  candidate's commit — confirm with `qvr ops promote <skill>` (non-zero exit
+  blocks).
+- **No AI attribution in the PR.** Title and body describe the change and the
+  before→after eval score; they carry no assistant byline or footer.
+
+## What qvr provides vs what you provide
+
+| Step | Provided by |
+|------|-------------|
+| Capture runs, lock-verified | **qvr** (`audit discover`) |
+| Human + auto feedback | **qvr** (`audit annotate` / outcome) |
+| Eval gate, keyed to commit | **qvr** (`ops eval run`) |
+| Lineage (score over commits) | **qvr** (`ops lineage`) |
+| Promotion gate | **qvr** (`ops promote`) |
+| Scheduling (cron) | **you / CI** |
+| Running the agent + drafting the diff | **you / a cloud agent** |
+| Opening the PR | **you / CI** |

From abe5b42ccf9f5f6e77703c6124b02ed4249e0e36 Mon Sep 17 00:00:00 2001
From: Rakshith <sairakshith977@gmail.com>
Date: Tue, 16 Jun 2026 23:50:29 -0400
Subject: [PATCH 03/10] fix(ops): address greptile review findings
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- ops promote: surface store errors from latestPassingEval instead of
  swallowing them — a transient DB failure no longer reads as "no passing
  eval" and silently blocks/misleads a CI gate (P1).
- ops eval: evalRunRow takes the suite as a parameter rather than closing
  over the flag global, so it is pure and testable in isolation.
- eval evidence: lastAssistantText now requires role=="assistant" before
  returning content, guarding against a future multi-message output capture.
- eval evidence: Evidence.Tools now counts the same tool-sequence the
  tool_constraint grader walks, so a `maxTools` ceiling means the same thing
  under behavior and tool_constraint graders.
- migration 0009: eval_case_results.eval_run_id is now a FK to
  eval_runs(id) ON DELETE CASCADE (FK enforcement is on), so case rows can't
  be orphaned by a future eval-run prune.
---
 cmd/ops_eval.go                               |  9 ++++----
 cmd/ops_promote.go                            | 22 ++++++++++++-------
 internal/eval/evidence.go                     |  9 ++++++--
 .../ops/store/migrations/0009_eval_runs.sql   |  2 +-
 4 files changed, 27 insertions(+), 15 deletions(-)

diff --git a/cmd/ops_eval.go b/cmd/ops_eval.go
index 1d1c04b..d9fc5e5 100644
--- a/cmd/ops_eval.go
+++ b/cmd/ops_eval.go
@@ -85,7 +85,7 @@ func runOpsEval(cmd *cobra.Command, args []string) error {
 		return err
 	}
 
-	if _, err := s.PutEvalRun(cmd.Context(), evalRunRow(rs, result)); err != nil {
+	if _, err := s.PutEvalRun(cmd.Context(), evalRunRow(rs, result, evalSuite)); err != nil {
 		return fmt.Errorf("record eval run: %w", err)
 	}
 
@@ -99,9 +99,10 @@ func runOpsEval(cmd *cobra.Command, args []string) error {
 	return nil
 }
 
-// evalRunRow flattens a run result into the persisted row + its case rows.
-func evalRunRow(rs *resolvedSkill, r *eval.RunResult) *store.EvalRunRow {
-	suite := evalSuite
+// evalRunRow flattens a run result into the persisted row + its case rows. The
+// requested suite is passed in (not read from the flag global) so the function
+// is pure and testable in isolation.
+func evalRunRow(rs *resolvedSkill, r *eval.RunResult, suite string) *store.EvalRunRow {
 	if suite == "" {
 		suite = "*"
 	}
diff --git a/cmd/ops_promote.go b/cmd/ops_promote.go
index 97eca56..2e6e1a6 100644
--- a/cmd/ops_promote.go
+++ b/cmd/ops_promote.go
@@ -54,7 +54,10 @@ func runOpsPromote(cmd *cobra.Command, args []string) error {
 	}
 	defer s.Close()
 
-	passing := latestPassingEval(cmd, s, rs)
+	passing, err := latestPassingEval(cmd, s, rs)
+	if err != nil {
+		return fmt.Errorf("look up eval history for %s: %w", rs.Name, err)
+	}
 	decision := promoteDecision(rs, passing)
 
 	if outputFormat == "json" {
@@ -98,20 +101,23 @@ func promoteDecision(rs *resolvedSkill, passing *store.EvalRunRow) promoteDecisi
 }
 
 // latestPassingEval returns the newest passing eval run for the skill's locked
-// commit, or nil. A skill with no locked commit ("") can't be evidence-gated by
-// commit, so this returns nil (promotion then requires --force-no-eval).
-func latestPassingEval(cmd *cobra.Command, s store.Store, rs *resolvedSkill) *store.EvalRunRow {
+// commit, or nil when there is none. A skill with no locked commit ("") can't be
+// evidence-gated by commit, so it returns nil (promotion then requires
+// --force-no-eval). A store error is propagated, NOT swallowed — otherwise a
+// transient DB failure would read as "no passing eval" and silently block (or
+// mislead) a CI gate.
+func latestPassingEval(cmd *cobra.Command, s store.Store, rs *resolvedSkill) (*store.EvalRunRow, error) {
 	if rs.Commit == "" {
-		return nil
+		return nil, nil
 	}
 	runs, err := s.ListEvalRuns(cmd.Context(), &store.EvalRunFilter{SkillName: rs.Name, SkillCommit: rs.Commit})
 	if err != nil {
-		return nil
+		return nil, err
 	}
 	for _, r := range runs { // newest-first
 		if r.Pass {
-			return r
+			return r, nil
 		}
 	}
-	return nil
+	return nil, nil
 }
diff --git a/internal/eval/evidence.go b/internal/eval/evidence.go
index 11a4cdd..8c9bbbb 100644
--- a/internal/eval/evidence.go
+++ b/internal/eval/evidence.go
@@ -34,7 +34,6 @@ func BuildEvidence(meta *store.SessionMetaRow, spans []*store.SpanRow) *Evidence
 		SessionID:  meta.SessionID.String(),
 		Outcome:    meta.Outcome,
 		Skills:     append([]string(nil), meta.Skills...),
-		Tools:      int(meta.Tools),
 		Turns:      int(meta.Turns),
 		DurationMs: meta.DurationMs(),
 	}
@@ -57,6 +56,12 @@ func BuildEvidence(meta *store.SessionMetaRow, spans []*store.SpanRow) *Evidence
 		}
 	}
 	e.FinalMessage = lastLLMText
+	// Tools counts the same tool-named spans the tool_constraint grader walks
+	// (ToolSequence, which includes the skill-load call), so a `maxTools` ceiling
+	// means the same thing whether an author writes it under a behavior or a
+	// tool_constraint grader. (Distinct from meta.Tools, which excludes SKILL
+	// spans and feeds the session list, not the gate.)
+	e.Tools = len(e.ToolSequence)
 	return e
 }
 
@@ -87,7 +92,7 @@ func lastAssistantText(attrs map[string]any) string {
 		return ""
 	}
 	for _, m := range slices.Backward(msgs) {
-		if m.Content != "" {
+		if m.Role == "assistant" && m.Content != "" {
 			return m.Content
 		}
 	}
diff --git a/internal/ops/store/migrations/0009_eval_runs.sql b/internal/ops/store/migrations/0009_eval_runs.sql
index 340ea8c..de12808 100644
--- a/internal/ops/store/migrations/0009_eval_runs.sql
+++ b/internal/ops/store/migrations/0009_eval_runs.sql
@@ -18,7 +18,7 @@ CREATE INDEX IF NOT EXISTS idx_eval_runs_skill ON eval_runs(skill_name, skill_co
 CREATE INDEX IF NOT EXISTS idx_eval_runs_started ON eval_runs(started_at);
 
 CREATE TABLE IF NOT EXISTS eval_case_results (
-  eval_run_id   INTEGER NOT NULL,
+  eval_run_id   INTEGER NOT NULL REFERENCES eval_runs(id) ON DELETE CASCADE,
   suite         TEXT NOT NULL,
   case_name     TEXT NOT NULL,
   pass          INTEGER NOT NULL,

From 99ba8fa92e39bb39087b55362155f82dc277d47f Mon Sep 17 00:00:00 2001
From: Rakshith <sairakshith977@gmail.com>
Date: Tue, 16 Jun 2026 23:58:17 -0400
Subject: [PATCH 04/10] fix(ops): promoteDecision takes flags as parameters

Mirror the evalRunRow refactor: promoteDecision now receives force/reason
as explicit arguments instead of reading the Cobra flag globals, so the
pure gate logic is independently testable and its sub-tests no longer
mutate package state.
---
 cmd/ops_promote.go |  8 ++++----
 cmd/ops_test.go    | 11 ++++-------
 2 files changed, 8 insertions(+), 11 deletions(-)

diff --git a/cmd/ops_promote.go b/cmd/ops_promote.go
index 2e6e1a6..db7a508 100644
--- a/cmd/ops_promote.go
+++ b/cmd/ops_promote.go
@@ -58,7 +58,7 @@ func runOpsPromote(cmd *cobra.Command, args []string) error {
 	if err != nil {
 		return fmt.Errorf("look up eval history for %s: %w", rs.Name, err)
 	}
-	decision := promoteDecision(rs, passing)
+	decision := promoteDecision(rs, passing, promoteForce, promoteReason)
 
 	if outputFormat == "json" {
 		if err := printer.JSON(decision); err != nil {
@@ -84,13 +84,13 @@ type promoteDecisionResult struct {
 	Message  string `json:"message"`
 }
 
-func promoteDecision(rs *resolvedSkill, passing *store.EvalRunRow) promoteDecisionResult {
-	d := promoteDecisionResult{Skill: rs.Name, Commit: shortCommit(rs.Commit), Reason: promoteReason}
+func promoteDecision(rs *resolvedSkill, passing *store.EvalRunRow, force bool, reason string) promoteDecisionResult {
+	d := promoteDecisionResult{Skill: rs.Name, Commit: shortCommit(rs.Commit), Reason: reason}
 	switch {
 	case passing != nil:
 		d.Promoted = true
 		d.Message = fmt.Sprintf("%s @ %s is backed by a passing eval (run #%d) — clear to promote", rs.Name, shortCommit(rs.Commit), passing.ID)
-	case promoteForce:
+	case force:
 		d.Promoted = true
 		d.Forced = true
 		d.Message = fmt.Sprintf("%s @ %s promoted WITHOUT a passing eval (--force-no-eval)", rs.Name, shortCommit(rs.Commit))
diff --git a/cmd/ops_test.go b/cmd/ops_test.go
index fb37894..f972e2c 100644
--- a/cmd/ops_test.go
+++ b/cmd/ops_test.go
@@ -87,29 +87,26 @@ func TestOpsPromote_Gate(t *testing.T) {
 }
 
 // TestPromoteDecision unit-tests the pure gate logic across its three branches.
+// Inputs are passed explicitly (no flag globals), so sub-tests stay independent.
 func TestPromoteDecision(t *testing.T) {
 	rs := &resolvedSkill{Name: "guard-tests", Commit: "abc1234"}
 
 	t.Run("passing eval clears it", func(t *testing.T) {
-		promoteForce = false
-		d := promoteDecision(rs, &store.EvalRunRow{ID: 7, Pass: true})
+		d := promoteDecision(rs, &store.EvalRunRow{ID: 7, Pass: true}, false, "")
 		if !d.Promoted || d.Forced {
 			t.Errorf("want promoted, not forced: %+v", d)
 		}
 	})
 	t.Run("no eval refuses", func(t *testing.T) {
-		promoteForce = false
-		if d := promoteDecision(rs, nil); d.Promoted {
+		if d := promoteDecision(rs, nil, false, ""); d.Promoted {
 			t.Errorf("want refusal, got %+v", d)
 		}
 	})
 	t.Run("force overrides", func(t *testing.T) {
-		promoteForce = true
-		d := promoteDecision(rs, nil)
+		d := promoteDecision(rs, nil, true, "shipping anyway")
 		if !d.Promoted || !d.Forced {
 			t.Errorf("want forced promote: %+v", d)
 		}
-		promoteForce = false
 	})
 }
 

From d25873d50b36926c7cc55cfb1662391a82ba05b7 Mon Sep 17 00:00:00 2001
From: Rakshith <sairakshith977@gmail.com>
Date: Wed, 17 Jun 2026 00:07:44 -0400
Subject: [PATCH 05/10] fix(ops): graceful read-only fallback + no orphaned
 eval refs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Round 3 greptile findings:

- ListEvalRuns / ListAnnotations now return an empty result (not a raw
  "no such table" error) when their table is absent — the case a read-only
  open hits on a DB that predates migrations 0008/0009 (read-only opens skip
  migration apply). `qvr ops lineage` and `qvr audit annotations` therefore
  degrade gracefully on a freshly-upgraded install. New noSuchTable() guard.
- DeleteSession nulls eval_runs.session_id for the deleted session instead
  of leaving it dangling. The eval verdict is durable lineage evidence keyed
  by {skill, commit}, so it outlives the session; only the stale pointer is
  cleared.
---
 internal/ops/store/annotations.go      |  3 ++
 internal/ops/store/annotations_test.go | 18 ++++++++++
 internal/ops/store/eval_runs.go        |  3 ++
 internal/ops/store/eval_runs_test.go   | 50 ++++++++++++++++++++++++++
 internal/ops/store/scan.go             | 13 ++++++-
 internal/ops/store/sqlite.go           |  7 ++++
 6 files changed, 93 insertions(+), 1 deletion(-)

diff --git a/internal/ops/store/annotations.go b/internal/ops/store/annotations.go
index c3db406..3874991 100644
--- a/internal/ops/store/annotations.go
+++ b/internal/ops/store/annotations.go
@@ -80,6 +80,9 @@ func (s *sqliteStore) ListAnnotations(ctx context.Context, f *AnnotationFilter)
 
 	rows, err := s.db.QueryContext(ctx, q, args...)
 	if err != nil {
+		if noSuchTable(err) {
+			return nil, nil // pre-0008 DB opened read-only: no annotations yet, not an error
+		}
 		return nil, fmt.Errorf("store: list annotations: %w", err)
 	}
 	defer rows.Close()
diff --git a/internal/ops/store/annotations_test.go b/internal/ops/store/annotations_test.go
index 7cfae24..b72e35a 100644
--- a/internal/ops/store/annotations_test.go
+++ b/internal/ops/store/annotations_test.go
@@ -56,6 +56,24 @@ func TestAnnotations_RequiresOutcome(t *testing.T) {
 	}
 }
 
+// TestListAnnotations_MissingTableIsEmpty pins the read-only-upgrade fallback: a
+// DB predating migration 0008 yields an empty result, not a "no such table"
+// error (the path `qvr audit annotations` / `ops lineage` hit read-only).
+func TestListAnnotations_MissingTableIsEmpty(t *testing.T) {
+	st := openTestStore(t)
+	sq, ok := st.(*sqliteStore)
+	if !ok {
+		t.Fatal("expected *sqliteStore")
+	}
+	if _, err := sq.db.Exec(`DROP TABLE annotations;`); err != nil {
+		t.Fatalf("drop table: %v", err)
+	}
+	got, err := st.ListAnnotations(context.Background(), &AnnotationFilter{Skill: "x"})
+	if err != nil || got != nil {
+		t.Errorf("missing table: got (%v, %v), want (nil, nil)", got, err)
+	}
+}
+
 // TestAnnotations_SurviveRederive proves the whole point of a separate table:
 // re-deriving a session (which replaces spans + session_meta) does NOT touch
 // the human verdict.
diff --git a/internal/ops/store/eval_runs.go b/internal/ops/store/eval_runs.go
index 26da6de..0be7fc6 100644
--- a/internal/ops/store/eval_runs.go
+++ b/internal/ops/store/eval_runs.go
@@ -110,6 +110,9 @@ func (s *sqliteStore) ListEvalRuns(ctx context.Context, f *EvalRunFilter) ([]*Ev
 
 	rows, err := s.db.QueryContext(ctx, q, args...)
 	if err != nil {
+		if noSuchTable(err) {
+			return nil, nil // pre-0009 DB opened read-only: no runs yet, not an error
+		}
 		return nil, fmt.Errorf("store: list eval runs: %w", err)
 	}
 	defer rows.Close()
diff --git a/internal/ops/store/eval_runs_test.go b/internal/ops/store/eval_runs_test.go
index 75c63a0..b3da2e7 100644
--- a/internal/ops/store/eval_runs_test.go
+++ b/internal/ops/store/eval_runs_test.go
@@ -3,6 +3,8 @@ package store
 import (
 	"context"
 	"testing"
+
+	"github.com/google/uuid"
 )
 
 // TestEvalRuns_RoundTrip pins the write/read path: a run and its case rows land
@@ -64,3 +66,51 @@ func TestEvalRuns_RequiresRow(t *testing.T) {
 		t.Error("expected an error for a nil eval run")
 	}
 }
+
+// TestListEvalRuns_MissingTableIsEmpty pins the read-only-upgrade fallback: a DB
+// that predates migration 0009 (tables absent, migrations skipped on a
+// read-only open) yields an empty result, not a "no such table" error.
+func TestListEvalRuns_MissingTableIsEmpty(t *testing.T) {
+	st := openTestStore(t)
+	sq, ok := st.(*sqliteStore)
+	if !ok {
+		t.Fatal("expected *sqliteStore")
+	}
+	if _, err := sq.db.Exec(`DROP TABLE eval_case_results; DROP TABLE eval_runs;`); err != nil {
+		t.Fatalf("drop tables: %v", err)
+	}
+	got, err := st.ListEvalRuns(context.Background(), &EvalRunFilter{SkillName: "x"})
+	if err != nil || got != nil {
+		t.Errorf("missing table: got (%v, %v), want (nil, nil)", got, err)
+	}
+}
+
+// TestDeleteSession_NullsEvalRunSession proves an eval verdict OUTLIVES the
+// session it graded (durable lineage), with its now-stale session_id nulled.
+func TestDeleteSession_NullsEvalRunSession(t *testing.T) {
+	st := openTestStore(t)
+	ctx := context.Background()
+	sid := uuid.New()
+	if err := st.ReplaceSessionDerivation(ctx, metaFor(sid, "claude-code", 1000, "guard-tests"), nil); err != nil {
+		t.Fatalf("derive: %v", err)
+	}
+	if _, err := st.PutEvalRun(ctx, &EvalRunRow{
+		SkillName: "guard-tests", SkillCommit: "abc1234", Suite: "s",
+		SessionID: sid.String(), Passed: 1, Pass: true,
+	}); err != nil {
+		t.Fatalf("put eval run: %v", err)
+	}
+	if _, err := st.DeleteSession(ctx, sid); err != nil {
+		t.Fatalf("delete session: %v", err)
+	}
+	runs, err := st.ListEvalRuns(ctx, &EvalRunFilter{SkillName: "guard-tests"})
+	if err != nil {
+		t.Fatalf("list: %v", err)
+	}
+	if len(runs) != 1 {
+		t.Fatalf("eval verdict did not survive session delete: got %d", len(runs))
+	}
+	if runs[0].SessionID != "" {
+		t.Errorf("stale session_id not cleared: %q", runs[0].SessionID)
+	}
+}
diff --git a/internal/ops/store/scan.go b/internal/ops/store/scan.go
index 6853183..2859c54 100644
--- a/internal/ops/store/scan.go
+++ b/internal/ops/store/scan.go
@@ -1,6 +1,17 @@
 package store
 
-import "database/sql"
+import (
+	"database/sql"
+	"strings"
+)
+
+// noSuchTable reports a "missing table" error — the condition a read-only open
+// hits when the DB predates a migration that adds a table (read-only opens skip
+// migrations.Apply). Callers degrade to an empty result, mirroring the
+// stale-column fallback the session_meta read path uses.
+func noSuchTable(err error) bool {
+	return err != nil && strings.Contains(err.Error(), "no such table")
+}
 
 // nullableString maps "" → SQL NULL so empty Go strings round-trip as NULL
 // columns rather than empty-string columns.
diff --git a/internal/ops/store/sqlite.go b/internal/ops/store/sqlite.go
index 77fa03f..9cde201 100644
--- a/internal/ops/store/sqlite.go
+++ b/internal/ops/store/sqlite.go
@@ -151,6 +151,13 @@ func (s *sqliteStore) DeleteSession(ctx context.Context, sessionID uuid.UUID) (i
 	if _, err := tx.ExecContext(ctx, `DELETE FROM annotations WHERE session_id = ?`, id); err != nil {
 		return 0, fmt.Errorf("store: delete session annotations: %w", err)
 	}
+	// An eval_run is keyed by {skill, commit} and is durable lineage evidence,
+	// so it OUTLIVES the graded session — but its session_id would otherwise
+	// dangle. Null it instead of deleting the verdict, removing the stale
+	// pointer while keeping the gate history intact.
+	if _, err := tx.ExecContext(ctx, `UPDATE eval_runs SET session_id = NULL WHERE session_id = ?`, id); err != nil {
+		return 0, fmt.Errorf("store: clear session eval refs: %w", err)
+	}
 	if err := tx.Commit(); err != nil {
 		return 0, fmt.Errorf("store: delete session commit: %w", err)
 	}

From 53bdf9fd3834e7acca1f589d43215f129e0051d4 Mon Sep 17 00:00:00 2001
From: Rakshith <sairakshith977@gmail.com>
Date: Wed, 17 Jun 2026 00:15:47 -0400
Subject: [PATCH 06/10] fix(ops): actionable promote refusal + annotations
 created_at index

Round 4 greptile findings:

- ops promote: when a skill has no locked commit, the refusal now points
  straight at --force-no-eval instead of suggesting `qvr ops eval run`, which
  would record under an empty commit and loop back to the same refusal.
- migration 0008: index annotations(created_at) so the --since range filter
  on `audit annotations` / `ops lineage` uses a B-tree seek, not a scan.
---
 cmd/ops_promote.go                                 | 14 ++++++++++++--
 cmd/ops_test.go                                    |  8 +++++---
 internal/ops/store/migrations/0008_annotations.sql |  2 ++
 3 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/cmd/ops_promote.go b/cmd/ops_promote.go
index db7a508..8f42d78 100644
--- a/cmd/ops_promote.go
+++ b/cmd/ops_promote.go
@@ -68,12 +68,22 @@ func runOpsPromote(cmd *cobra.Command, args []string) error {
 		printer.Info(decision.Message)
 	}
 	if !decision.Promoted {
-		return fmt.Errorf("refusing to promote %s: no passing eval for commit %s (run `qvr ops eval run %s`, or pass --force-no-eval)",
-			rs.Name, shortCommit(rs.Commit), rs.Name)
+		return promoteRefusalError(rs)
 	}
 	return nil
 }
 
+// promoteRefusalError builds the refusal, guiding the user to the path that can
+// actually unblock them. For an unlocked skill (no commit), `qvr ops eval run`
+// would just record under an empty commit and loop back to this same refusal —
+// so --force-no-eval is the only real way forward and the message says so.
+func promoteRefusalError(rs *resolvedSkill) error {
+	if rs.Commit == "" {
+		return fmt.Errorf("refusing to promote %s: it has no locked commit to evidence-gate on — pass --force-no-eval to promote anyway, or install it through the lock so eval verdicts can pin to its commit", rs.Name)
+	}
+	return fmt.Errorf("refusing to promote %s: no passing eval for commit %s — run `qvr ops eval run %s`, or pass --force-no-eval", rs.Name, shortCommit(rs.Commit), rs.Name)
+}
+
 // promoteDecisionResult is the JSON/text shape of a promote check.
 type promoteDecisionResult struct {
 	Skill    string `json:"skill"`
diff --git a/cmd/ops_test.go b/cmd/ops_test.go
index f972e2c..17fac15 100644
--- a/cmd/ops_test.go
+++ b/cmd/ops_test.go
@@ -64,9 +64,11 @@ func TestOpsPromote_Gate(t *testing.T) {
 	dir := writeGuardTestsFixture(t)
 	seedSession(t, cfg, sessionSeed{StartedMs: 1000, Skill: "guard-tests", Outcome: "success", Tools: []string{"Bash"}})
 
-	// No eval recorded yet → refuse.
-	if _, _, err := runRoot(t, nil, "ops", "promote", "guard-tests", "--skill-dir", dir); err == nil {
-		t.Error("expected promote to refuse without a passing eval")
+	// No eval recorded yet (and no locked commit) → refuse, and point at the
+	// only path that can actually unblock: --force-no-eval.
+	_, _, err := runRoot(t, nil, "ops", "promote", "guard-tests", "--skill-dir", dir)
+	if err == nil || !strings.Contains(err.Error(), "force-no-eval") {
+		t.Errorf("want a refusal guiding to --force-no-eval, got %v", err)
 	}
 
 	// Forced → allowed, and the JSON records the override.
diff --git a/internal/ops/store/migrations/0008_annotations.sql b/internal/ops/store/migrations/0008_annotations.sql
index e35f90b..75c3cb6 100644
--- a/internal/ops/store/migrations/0008_annotations.sql
+++ b/internal/ops/store/migrations/0008_annotations.sql
@@ -17,3 +17,5 @@ CREATE TABLE IF NOT EXISTS annotations (
 );
 CREATE INDEX IF NOT EXISTS idx_annotations_session ON annotations(session_id);
 CREATE INDEX IF NOT EXISTS idx_annotations_skill ON annotations(skill);
+-- created_at backs the --since range filter on `audit annotations` / `ops lineage`.
+CREATE INDEX IF NOT EXISTS idx_annotations_created ON annotations(created_at);

From ada7d2ba8dbe4af06d590882df6227e18817965e Mon Sep 17 00:00:00 2001
From: Rakshith <sairakshith977@gmail.com>
Date: Wed, 17 Jun 2026 00:24:21 -0400
Subject: [PATCH 07/10] fix(eval): validate text `on` field + document
 annotations no-FK choice
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Round 5 greptile findings:

- text grader: the `on` field is now validated at load time (only
  final_message is accepted), so a typo like `on: message` is a manifest
  error instead of silently grading FinalMessage. evidenceField returns ""
  for any unknown field as defense-in-depth.
- migration 0008: document why annotations deliberately has NO
  session_meta FK — a cascade FK would be fired by the rederive
  INSERT-OR-REPLACE and wipe the durable verdicts the table exists to keep;
  cleanup on a real purge lives in store.DeleteSession.
---
 internal/eval/eval_test.go                        |  2 ++
 internal/eval/graders.go                          | 15 ++++++++++++---
 .../ops/store/migrations/0008_annotations.sql     |  7 +++++++
 3 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/internal/eval/eval_test.go b/internal/eval/eval_test.go
index 489a590..acc0423 100644
--- a/internal/eval/eval_test.go
+++ b/internal/eval/eval_test.go
@@ -21,6 +21,8 @@ func TestParse_RejectsNoOpGraders(t *testing.T) {
 		{"outcome no expect", "type: outcome", true},
 		{"text ok", "type: text\n            contains: [\"x\"]", false},
 		{"text empty", "type: text", true},
+		{"text on final_message", "type: text\n            on: final_message\n            contains: [\"x\"]", false},
+		{"text on unknown", "type: text\n            on: message\n            contains: [\"x\"]", true},
 		{"sequence empty", "type: tool_sequence", true},
 		{"constraint empty", "type: tool_constraint", true},
 		{"constraint maxTools ok", "type: tool_constraint\n            maxTools: 5", false},
diff --git a/internal/eval/graders.go b/internal/eval/graders.go
index f1bfa7d..80149fe 100644
--- a/internal/eval/graders.go
+++ b/internal/eval/graders.go
@@ -51,7 +51,13 @@ var graderValidators = map[string]func(GraderSpec) error{
 		return need(g.Expect != "", "needs `expect`")
 	},
 	"text": func(g GraderSpec) error {
-		return need(len(g.Contains) > 0 || len(g.Reject) > 0, "needs `contains` or `reject`")
+		switch {
+		case len(g.Contains) == 0 && len(g.Reject) == 0:
+			return fmt.Errorf("needs `contains` or `reject`")
+		case g.On != "" && g.On != "final_message":
+			return fmt.Errorf("unknown `on` field %q (want final_message)", g.On)
+		}
+		return nil
 	},
 	"tool_sequence": func(g GraderSpec) error {
 		return need(len(g.Sequence) > 0, "needs `sequence`")
@@ -114,13 +120,16 @@ func gradeText(g GraderSpec, e *Evidence) GraderResult {
 }
 
 // evidenceField selects the text body a text grader reads. final_message is the
-// default and only field today; an unknown name falls back to it.
+// default and only field today. An unknown name returns "" (defensive: the
+// manifest validator already rejects unknown `on` values, so a typo fails at
+// load time rather than silently grading the wrong field here). Add new fields
+// to this switch AND to the text validator together.
 func evidenceField(name string, e *Evidence) string {
 	switch name {
 	case "", "final_message":
 		return e.FinalMessage
 	default:
-		return e.FinalMessage
+		return ""
 	}
 }
 
diff --git a/internal/ops/store/migrations/0008_annotations.sql b/internal/ops/store/migrations/0008_annotations.sql
index 75c3cb6..a45659c 100644
--- a/internal/ops/store/migrations/0008_annotations.sql
+++ b/internal/ops/store/migrations/0008_annotations.sql
@@ -7,6 +7,13 @@
 -- skill scopes the verdict to one skill within the session. Append-only: a
 -- session can accrue several verdicts over time (the latest by created_at wins
 -- for consumers that want one).
+--
+-- DELIBERATELY no `REFERENCES session_meta(session_id)` FK: the rederive write
+-- path is `INSERT OR REPLACE INTO session_meta`, and under REPLACE SQLite first
+-- DELETEs the conflicting row — which would fire an ON DELETE CASCADE and wipe
+-- every annotation on each rederive, destroying the durable human verdicts this
+-- table exists to preserve. Cleanup on a genuine session purge is handled
+-- explicitly in store.DeleteSession instead (a deliberate app-layer invariant).
 CREATE TABLE IF NOT EXISTS annotations (
   session_id  TEXT NOT NULL,
   skill       TEXT,

From 4811cfacbc1062fc5ff229bda2060bcd54ef2092 Mon Sep 17 00:00:00 2001
From: Rakshith <sairakshith977@gmail.com>
Date: Wed, 17 Jun 2026 01:03:28 -0400
Subject: [PATCH 08/10] fix(ops): gate honors newest verdict + drop
 ListEvalRuns N+1
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Round 6 greptile findings:

- promote gate: latestPassingEval now clears only when the MOST RECENT eval
  for the locked commit passed — a newer failing run is no longer overridden
  by an older pass, so a known regression can't be promoted past. Pinned by
  TestLatestPassingEval_NewestVerdictWins.
- ListEvalRuns: per-case rows now load only when EvalRunFilter.IncludeCases
  is set, so header-only callers (lineage, the promote gate) skip the N+1.
  WHERE-building and case-loading extracted to helpers (keeps cyclo <=15).
---
 cmd/ops_promote.go                   | 22 +++++-----
 cmd/ops_test.go                      | 50 ++++++++++++++++++++++
 internal/ops/store/eval_runs.go      | 64 +++++++++++++++++++---------
 internal/ops/store/eval_runs_test.go |  2 +-
 4 files changed, 105 insertions(+), 33 deletions(-)

diff --git a/cmd/ops_promote.go b/cmd/ops_promote.go
index 8f42d78..7a902df 100644
--- a/cmd/ops_promote.go
+++ b/cmd/ops_promote.go
@@ -110,24 +110,24 @@ func promoteDecision(rs *resolvedSkill, passing *store.EvalRunRow, force bool, r
 	return d
 }
 
-// latestPassingEval returns the newest passing eval run for the skill's locked
-// commit, or nil when there is none. A skill with no locked commit ("") can't be
-// evidence-gated by commit, so it returns nil (promotion then requires
-// --force-no-eval). A store error is propagated, NOT swallowed — otherwise a
-// transient DB failure would read as "no passing eval" and silently block (or
-// mislead) a CI gate.
+// latestPassingEval returns the commit's eval verdict when it is clear to
+// promote: the MOST RECENT run for the locked commit, but only if that run
+// passed. The freshest verdict governs — a newer failing run is NOT overridden
+// by an older pass, so a known regression can't be promoted past. A skill with
+// no locked commit ("") can't be evidence-gated by commit, so it returns nil
+// (promotion then requires --force-no-eval). A store error is propagated, NOT
+// swallowed — otherwise a transient DB failure would read as "no passing eval"
+// and silently block (or mislead) a CI gate.
 func latestPassingEval(cmd *cobra.Command, s store.Store, rs *resolvedSkill) (*store.EvalRunRow, error) {
 	if rs.Commit == "" {
 		return nil, nil
 	}
-	runs, err := s.ListEvalRuns(cmd.Context(), &store.EvalRunFilter{SkillName: rs.Name, SkillCommit: rs.Commit})
+	runs, err := s.ListEvalRuns(cmd.Context(), &store.EvalRunFilter{SkillName: rs.Name, SkillCommit: rs.Commit, Limit: 1})
 	if err != nil {
 		return nil, err
 	}
-	for _, r := range runs { // newest-first
-		if r.Pass {
-			return r, nil
-		}
+	if len(runs) > 0 && runs[0].Pass { // newest-first; the latest verdict wins
+		return runs[0], nil
 	}
 	return nil, nil
 }
diff --git a/cmd/ops_test.go b/cmd/ops_test.go
index 17fac15..2220552 100644
--- a/cmd/ops_test.go
+++ b/cmd/ops_test.go
@@ -1,14 +1,64 @@
 package cmd
 
 import (
+	"context"
 	"encoding/json"
+	"path/filepath"
 	"strings"
 	"testing"
+	"time"
 
 	"github.com/astra-sh/qvr/internal/config"
 	"github.com/astra-sh/qvr/internal/ops/store"
+	"github.com/spf13/cobra"
 )
 
+// TestLatestPassingEval_NewestVerdictWins pins the gate semantics: the most
+// recent eval for the locked commit governs, so a newer FAIL is not overridden
+// by an older PASS.
+func TestLatestPassingEval_NewestVerdictWins(t *testing.T) {
+	ctx := context.Background()
+	st, err := store.Open(ctx, store.OpenOptions{Path: filepath.Join(t.TempDir(), "skillops.db")})
+	if err != nil {
+		t.Fatalf("open store: %v", err)
+	}
+	defer st.Close()
+
+	put := func(ts int64, pass bool) {
+		t.Helper()
+		if _, e := st.PutEvalRun(ctx, &store.EvalRunRow{
+			SkillName: "g", SkillCommit: "c1", Suite: "s",
+			StartedAt: time.Unix(ts, 0).UTC(), Pass: pass,
+		}); e != nil {
+			t.Fatalf("put: %v", e)
+		}
+	}
+	rs := &resolvedSkill{Name: "g", Commit: "c1"}
+	cmd := &cobra.Command{}
+	cmd.SetContext(ctx) // cobra.Command.Context() is nil until set
+
+	// older PASS, newer FAIL → gate must NOT clear.
+	put(1000, true)
+	put(2000, false)
+	got, err := latestPassingEval(cmd, st, rs)
+	if err != nil {
+		t.Fatalf("latestPassingEval: %v", err)
+	}
+	if got != nil {
+		t.Errorf("newest verdict is a FAIL; gate must not clear (got run #%d)", got.ID)
+	}
+
+	// a fresh PASS lands → gate clears again.
+	put(3000, true)
+	got, err = latestPassingEval(cmd, st, rs)
+	if err != nil {
+		t.Fatalf("latestPassingEval: %v", err)
+	}
+	if got == nil {
+		t.Error("newest verdict is a PASS; gate should clear")
+	}
+}
+
 // TestOpsEval_NoDatabase pins the helpful error when nothing has been captured.
 func TestOpsEval_NoDatabase(t *testing.T) {
 	isolatedHome(t, true)
diff --git a/internal/ops/store/eval_runs.go b/internal/ops/store/eval_runs.go
index 0be7fc6..404470a 100644
--- a/internal/ops/store/eval_runs.go
+++ b/internal/ops/store/eval_runs.go
@@ -39,6 +39,10 @@ type EvalRunFilter struct {
 	SkillCommit string
 	Since       *time.Time
 	Limit       int
+	// IncludeCases loads each run's per-case verdict rows (an extra query per
+	// run). Off by default so header-only callers (lineage, the promote gate)
+	// don't pay an N+1 they never read.
+	IncludeCases bool
 }
 
 // PutEvalRun inserts a run and its case rows in one tx, returning the new run id.
@@ -79,25 +83,11 @@ func (s *sqliteStore) PutEvalRun(ctx context.Context, r *EvalRunRow) (int64, err
 	return id, nil
 }
 
-// ListEvalRuns returns runs matching the filter, newest-first, each with its
-// case rows loaded.
+// ListEvalRuns returns runs matching the filter, newest-first. Per-case rows are
+// loaded only when the filter sets IncludeCases (header-only callers skip the
+// N+1).
 func (s *sqliteStore) ListEvalRuns(ctx context.Context, f *EvalRunFilter) ([]*EvalRunRow, error) {
-	var where []string
-	var args []any
-	if f != nil {
-		if f.SkillName != "" {
-			where = append(where, "skill_name = ?")
-			args = append(args, f.SkillName)
-		}
-		if f.SkillCommit != "" {
-			where = append(where, "skill_commit = ?")
-			args = append(args, f.SkillCommit)
-		}
-		if f.Since != nil {
-			where = append(where, "started_at >= ?")
-			args = append(args, f.Since.UTC())
-		}
-	}
+	where, args := evalRunWhere(f)
 	q := `SELECT id, skill_name, skill_commit, suite, session_id, started_at, passed, failed, pass FROM eval_runs`
 	if len(where) > 0 {
 		q += " WHERE " + strings.Join(where, " AND ")
@@ -128,14 +118,46 @@ func (s *sqliteStore) ListEvalRuns(ctx context.Context, f *EvalRunFilter) ([]*Ev
 	if err := rows.Err(); err != nil {
 		return nil, err
 	}
-	for _, r := range out {
+	if f != nil && f.IncludeCases {
+		if err := s.attachEvalCases(ctx, out); err != nil {
+			return nil, err
+		}
+	}
+	return out, nil
+}
+
+// evalRunWhere builds the WHERE clauses and args for an eval-run filter.
+func evalRunWhere(f *EvalRunFilter) ([]string, []any) {
+	var where []string
+	var args []any
+	if f == nil {
+		return where, args
+	}
+	if f.SkillName != "" {
+		where = append(where, "skill_name = ?")
+		args = append(args, f.SkillName)
+	}
+	if f.SkillCommit != "" {
+		where = append(where, "skill_commit = ?")
+		args = append(args, f.SkillCommit)
+	}
+	if f.Since != nil {
+		where = append(where, "started_at >= ?")
+		args = append(args, f.Since.UTC())
+	}
+	return where, args
+}
+
+// attachEvalCases loads each run's per-case verdict rows in place.
+func (s *sqliteStore) attachEvalCases(ctx context.Context, runs []*EvalRunRow) error {
+	for _, r := range runs {
 		cases, err := s.evalCases(ctx, r.ID)
 		if err != nil {
-			return nil, err
+			return err
 		}
 		r.Cases = cases
 	}
-	return out, nil
+	return nil
 }
 
 func (s *sqliteStore) evalCases(ctx context.Context, runID int64) ([]EvalCaseRow, error) {
diff --git a/internal/ops/store/eval_runs_test.go b/internal/ops/store/eval_runs_test.go
index b3da2e7..53d08cc 100644
--- a/internal/ops/store/eval_runs_test.go
+++ b/internal/ops/store/eval_runs_test.go
@@ -34,7 +34,7 @@ func TestEvalRuns_RoundTrip(t *testing.T) {
 		t.Fatal("expected a non-zero run id")
 	}
 
-	all, err := st.ListEvalRuns(ctx, &EvalRunFilter{SkillName: "triage-issue"})
+	all, err := st.ListEvalRuns(ctx, &EvalRunFilter{SkillName: "triage-issue", IncludeCases: true})
 	if err != nil {
 		t.Fatalf("list: %v", err)
 	}

From f7765d25e55d924ed2014f659f81247debb59428 Mon Sep 17 00:00:00 2001
From: Rakshith <sairakshith977@gmail.com>
Date: Wed, 17 Jun 2026 01:19:40 -0400
Subject: [PATCH 09/10] chore(ops): drop dead OutcomeUnknown const + document
 eval_runs no-FK
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Round 7 greptile findings (style/docs only):

- Remove the unused OutcomeUnknown constant — "unknown" is the ABSENCE of
  the attribute (""), and every comparison site already checks "", so the
  sentinel was dead and potentially misleading.
- migration 0009: document why eval_runs.session_id deliberately omits a FK
  (durable verdict outlives the session; DeleteSession NULLs the pointer),
  mirroring the rationale comment migration 0008 carries.
---
 internal/ops/derive/span.go                      | 4 +++-
 internal/ops/store/migrations/0009_eval_runs.sql | 7 ++++++-
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/internal/ops/derive/span.go b/internal/ops/derive/span.go
index 5724c41..178a188 100644
--- a/internal/ops/derive/span.go
+++ b/internal/ops/derive/span.go
@@ -44,11 +44,13 @@ const (
 // read model rolls these up into SessionMeta.Outcome (worst-of-spans).
 const OutcomeKey = "qvr.outcome"
 
+// The three outcome values a span can carry. "Unknown" is NOT a value — it is
+// the ABSENCE of the attribute (and the empty session rollup), so every
+// comparison site checks against "" rather than a sentinel string.
 const (
 	OutcomeSuccess = "success"
 	OutcomeFailure = "failure"
 	OutcomeBlocked = "blocked"
-	OutcomeUnknown = "unknown"
 )
 
 // classifyOutcome maps a tool result to an outcome. A non-error result is a
diff --git a/internal/ops/store/migrations/0009_eval_runs.sql b/internal/ops/store/migrations/0009_eval_runs.sql
index de12808..1075658 100644
--- a/internal/ops/store/migrations/0009_eval_runs.sql
+++ b/internal/ops/store/migrations/0009_eval_runs.sql
@@ -3,12 +3,17 @@
 -- skill's evals.yaml; the verdict is pinned to the exact locked commit that
 -- ran, so lineage can show "at commit abc the safety suite failed; at def it
 -- passed". Per-case rows carry the granular pass/fail + reason.
+-- session_id DELIBERATELY has no FK to session_meta: an eval_run is durable
+-- lineage evidence keyed by {skill, commit} and OUTLIVES the session it graded.
+-- store.DeleteSession NULLs this pointer on a genuine purge (keeping the verdict,
+-- dropping the stale id) rather than cascading the verdict away — the same
+-- app-layer-invariant choice migration 0008 makes for annotations.
 CREATE TABLE IF NOT EXISTS eval_runs (
   id            INTEGER PRIMARY KEY AUTOINCREMENT,
   skill_name    TEXT NOT NULL,
   skill_commit  TEXT NOT NULL,
   suite         TEXT NOT NULL,         -- requested suite, or '*' for all
-  session_id    TEXT,                  -- the graded session
+  session_id    TEXT,                  -- the graded session (nullable; see note above)
   started_at    DATETIME NOT NULL,
   passed        INTEGER NOT NULL,
   failed        INTEGER NOT NULL,

From dfa5cbfd5e93ce053a266efce1e8606c83977098 Mon Sep 17 00:00:00 2001
From: Rakshith <sairakshith977@gmail.com>
Date: Wed, 17 Jun 2026 01:29:17 -0400
Subject: [PATCH 10/10] docs(evolve-skill): verified install in CI recipe, not
 curl|sh

Round 8 greptile finding (security): the CI recipe piped
`curl -fsSL https://quiver.sh/install | sh`, so an adopter copying it
verbatim would execute unauthenticated network content in the runner with
full secrets access. Replace both jobs' install step with a pinned
`go install github.com/astra-sh/qvr@<tag>`, verified through the Go module
checksum database.
---
 skills/evolve-skill/references/ci-recipe.md | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/skills/evolve-skill/references/ci-recipe.md b/skills/evolve-skill/references/ci-recipe.md
index 3dd69bc..453687e 100644
--- a/skills/evolve-skill/references/ci-recipe.md
+++ b/skills/evolve-skill/references/ci-recipe.md
@@ -18,7 +18,13 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v4
-      - run: curl -fsSL https://quiver.sh/install | sh    # install qvr
+      - uses: actions/setup-go@v5
+        with: { go-version: '1.25' }
+      # Install a PINNED version, verified through the Go module checksum
+      # database — unlike `curl … | sh`, which executes unauthenticated network
+      # content in the runner (a MITM / compromised CDN / DNS hijack would run
+      # arbitrary code with full CI secrets access). Pin the tag you trust.
+      - run: go install github.com/astra-sh/qvr@v0.28.1
       - run: qvr audit enable && qvr audit discover
       # Grade each skill that ships an evals.yaml; non-zero exit fails the job.
       - run: |