Skip to content
Draft
109 changes: 109 additions & 0 deletions cmd/acceptance_behavioral_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
package cmd

import (
"encoding/json"
"os"
"path/filepath"
"testing"

"github.com/astra-sh/qvr/internal/config"
)

// TestAcceptance_BehavioralGate exercises the loop on a DIFFERENT dimension than
// the triage label-match: a skill that must verify its edits by running tests.
// The gate here is behavioral — tool sequence, tool constraints, skill
// invocation, efficiency — so it proves the eval substrate generalizes past
// simple text matching. First run edits without testing (fails); the corrected
// run edits THEN tests (passes).
func TestAcceptance_BehavioralGate(t *testing.T) {
isolatedHome(t, true)
cfg, err := config.Load()
if err != nil {
t.Fatalf("load config: %v", err)
}
skillDir := writeGuardTestsFixture(t)

// 1. A run that edited code but never ran the tests (Read, Edit — no Bash).
badID := seedSession(t, cfg, sessionSeed{
StartedMs: 1000, Skill: "guard-tests", FinalMsg: "Applied the change.",
Outcome: "success", Tools: []string{"Read", "Edit"},
})

// 2. Reviewer flags the unverified change.
if _, stderr, err := runRoot(t, nil, "audit", "annotate", badID,
"--skill", "guard-tests", "--outcome", "bad", "--note", "shipped without running tests"); err != nil {
t.Fatalf("annotate: err=%v stderr=%q", err, stderr)
}

// 3. Gate FAILS: tool_sequence Edit→Bash unsatisfied, Bash missing.
if _, _, err := runRoot(t, nil, "ops", "eval", "run", "guard-tests",
"--skill-dir", skillDir, "--output", "json"); err == nil {
t.Fatal("expected the baseline eval to FAIL for an unverified change")
}

// 4. Corrected run: Read, Edit, THEN Bash (runs the tests).
seedSession(t, cfg, sessionSeed{
StartedMs: 2000, Skill: "guard-tests", FinalMsg: "Applied the change and ran the tests.",
Outcome: "success", Tools: []string{"Read", "Edit", "Bash"},
})

// 5. Gate PASSES.
out, _, err := runRoot(t, nil, "ops", "eval", "run", "guard-tests",
"--skill-dir", skillDir, "--output", "json")
if err != nil {
t.Fatalf("expected the post-fix eval to PASS, got err=%v", err)
}
var res struct {
Pass bool `json:"pass"`
Failed int `json:"failed"`
}
if e := json.Unmarshal([]byte(out), &res); e != nil {
t.Fatalf("decode eval json: %v\n%s", e, out)
}
if !res.Pass || res.Failed != 0 {
t.Fatalf("post-fix eval = %+v, want pass with 0 failures", res)
}
}

// writeGuardTestsFixture creates a fixture skill whose suite asserts the skill
// fired, edits were followed by a test run, and no network was used — all
// behavioral graders over the captured trace.
func writeGuardTestsFixture(t *testing.T) string {
t.Helper()
dir := filepath.Join(t.TempDir(), "guard-tests")
if err := os.MkdirAll(dir, 0o755); err != nil {
t.Fatal(err)
}
skill := `---
name: guard-tests
description: Edits code and always verifies the change by running the test suite.
metadata:
author: quiver-playground
version: "1.0.0"
---

# Guard tests

After editing code, always run the test suite to verify the change before
reporting done. Never ship an edit you have not verified.
`
evals := `version: 1
suites:
- name: verifies-changes
cases:
- name: edits-then-runs-tests
graders:
- type: skill_invocation
expectSkills: ["guard-tests"]
- type: tool_sequence
sequence: ["Edit", "Bash"]
- type: tool_constraint
expectTools: ["Bash"]
rejectTools: ["WebFetch"]
- type: behavior
maxTools: 10
`
mustWrite(t, filepath.Join(dir, "SKILL.md"), skill)
mustWrite(t, filepath.Join(dir, "evals.yaml"), evals)
return dir
}
213 changes: 213 additions & 0 deletions cmd/acceptance_loop_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,213 @@
package cmd

import (
"context"
"encoding/json"
"fmt"
"os"
"path/filepath"
"testing"

"github.com/astra-sh/qvr/internal/config"
"github.com/astra-sh/qvr/internal/ops"
"github.com/astra-sh/qvr/internal/ops/store"
"github.com/google/uuid"
)

// TestAcceptance_SelfImprovementLoop recreates the article's triage loop end to
// end through the real CLI: a skill mis-triages an issue, a human flags it, the
// eval gate FAILS, the skill is "improved" (a corrected run), and the gate now
// PASSES — with the whole arc visible in `qvr ops lineage`. The fail→pass is the
// branch's acceptance criterion.
func TestAcceptance_SelfImprovementLoop(t *testing.T) {
isolatedHome(t, true)
cfg, err := config.Load()
if err != nil {
t.Fatalf("load config: %v", err)
}
skillDir := writeTriageFixture(t)

// 1. INNER LOOP: a real run that mis-triaged the issue (the article's
// "ready-to-implement" mistake). Captured + skill-attributed.
badID := seedTriageSession(t, cfg, 1000, "Labeled this issue ready-to-implement.")

// 2. HUMAN FEEDBACK: the reviewer flips the verdict and says why.
if _, stderr, err := runRoot(t, nil, "audit", "annotate", badID,
"--skill", "triage-issue", "--outcome", "bad", "--note", "ambiguous — needs a setting, should be needs-info"); err != nil {
t.Fatalf("annotate: err=%v stderr=%q", err, stderr)
}

// 3. BASELINE GATE: the eval must FAIL on the misclassification.
if _, _, err := runRoot(t, nil, "ops", "eval", "run", "triage-issue",
"--skill-dir", skillDir, "--suite", "triage-correctness", "--output", "json"); err == nil {
t.Fatal("expected the baseline eval to FAIL on the misclassified run")
}

// 4. IMPROVE: the loop edits the skill and re-runs it; the corrected run is
// captured (newer, so it is the most-recent session the eval grades).
seedTriageSession(t, cfg, 2000, "Labeled this issue needs-info pending a decision.")

// 5. POST-FIX GATE: the same eval must now PASS.
out, _, err := runRoot(t, nil, "ops", "eval", "run", "triage-issue",
"--skill-dir", skillDir, "--suite", "triage-correctness", "--output", "json")
if err != nil {
t.Fatalf("expected the post-fix eval to PASS, got err=%v", err)
}
var res struct {
Pass bool `json:"pass"`
Passed int `json:"passed"`
Failed int `json:"failed"`
}
if e := json.Unmarshal([]byte(out), &res); e != nil {
t.Fatalf("decode eval json: %v\n%s", e, out)
}
if !res.Pass || res.Failed != 0 {
t.Fatalf("post-fix eval = %+v, want pass with 0 failures", res)
}

// 6. LINEAGE: the timeline shows the fail, then the pass, plus the verdict.
lo, _, err := runRoot(t, nil, "ops", "lineage", "triage-issue", "--output", "json")
if err != nil {
t.Fatalf("lineage: %v", err)
}
assertFailPassArc(t, lo)
}

// assertFailPassArc checks the lineage timeline carries the article's full arc:
// at least one failed eval, one passed eval, and one human annotation.
func assertFailPassArc(t *testing.T, lineageJSON string) {
t.Helper()
var timeline []struct {
Kind string `json:"kind"`
Pass *bool `json:"pass"`
}
if e := json.Unmarshal([]byte(lineageJSON), &timeline); e != nil {
t.Fatalf("decode lineage json: %v\n%s", e, lineageJSON)
}
evalPass, evalFail, annotations := 0, 0, 0
for _, e := range timeline {
switch {
case e.Kind == "annotation":
annotations++
case e.Pass != nil && *e.Pass:
evalPass++
default:
evalFail++
}
}
if evalPass < 1 || evalFail < 1 || annotations < 1 {
t.Errorf("lineage missing the fail→pass arc: %d pass, %d fail, %d annotations", evalPass, evalFail, annotations)
}
}

// writeTriageFixture creates a fixture triage-issue skill with an evals.yaml
// whose correctness suite distinguishes the right label from the wrong one.
func writeTriageFixture(t *testing.T) string {
t.Helper()
dir := filepath.Join(t.TempDir(), "triage-issue")
if err := os.MkdirAll(dir, 0o755); err != nil {
t.Fatal(err)
}
skill := `---
name: triage-issue
description: Sorts incoming issues into ready-to-implement, duplicate, or needs-info.
metadata:
author: quiver-playground
version: "1.0.0"
---

# Triage an issue

Classify each incoming issue into exactly one bucket: ready-to-implement,
duplicate, or needs-info. When a feature request leaves an ambiguity (e.g.
whether to add a setting), prefer needs-info over ready-to-implement.
`
evals := `version: 1
suites:
- name: triage-correctness
cases:
- name: ambiguous-feature-needs-info
graders:
- type: outcome
expect: success
- type: text
on: final_message
contains: ["needs-info"]
reject: ["ready-to-implement"]
`
mustWrite(t, filepath.Join(dir, "SKILL.md"), skill)
mustWrite(t, filepath.Join(dir, "evals.yaml"), evals)
return dir
}

func mustWrite(t *testing.T, path, content string) {
t.Helper()
if err := os.WriteFile(path, []byte(content), 0o644); err != nil {
t.Fatalf("write %s: %v", path, err)
}
}

// seedTriageSession seeds a successful triage-issue run ending with finalMsg,
// using a single Bash tool. Thin wrapper over seedSession.
func seedTriageSession(t *testing.T, cfg *config.Config, startedMs int64, finalMsg string) string {
t.Helper()
return seedSession(t, cfg, sessionSeed{
StartedMs: startedMs, Skill: "triage-issue", FinalMsg: finalMsg,
Outcome: "success", Tools: []string{"Bash"},
})
}

// sessionSeed describes a synthetic captured session to inject straight into the
// audit store (bypassing agent-store discovery), so a test controls exactly the
// evidence the graders read: the skill that fired, the ordered tool calls, the
// session outcome, and the final assistant message.
type sessionSeed struct {
StartedMs int64
Skill string
FinalMsg string
Outcome string
Tools []string // ordered tool names (TOOL spans), after the SKILL span
}

// seedSession writes the synthetic session and returns its id.
func seedSession(t *testing.T, cfg *config.Config, seed sessionSeed) string {
t.Helper()
ctx := context.Background()
s, err := store.Open(ctx, store.OpenOptions{Path: ops.DBPath(cfg)})
if err != nil {
t.Fatalf("open store: %v", err)
}
defer s.Close()

sid := uuid.New()
outMsgs, _ := json.Marshal([]map[string]string{{"role": "assistant", "content": seed.FinalMsg}})
tid := "trace-" + sid.String()[:8]
end := seed.StartedMs + 1000

spans := []*store.SpanRow{
{SpanID: tid + "-llm", TraceID: tid, SessionID: sid, AgentName: "claude-code",
Kind: "LLM", Name: "chat", StartMs: seed.StartedMs, EndMs: end,
Attributes: fmt.Sprintf(`{"gen_ai.output.messages":%q}`, string(outMsgs))},
{SpanID: tid + "-skill", TraceID: tid, SessionID: sid, AgentName: "claude-code",
Kind: "SKILL", Name: "execute_tool Skill", StartMs: seed.StartedMs + 50, EndMs: seed.StartedMs + 100,
Attributes: fmt.Sprintf(`{"gen_ai.tool.name":"Skill","skill.name":%q}`, seed.Skill)},
}
for i, tool := range seed.Tools {
at := seed.StartedMs + int64(100*(i+2))
spans = append(spans, &store.SpanRow{
SpanID: fmt.Sprintf("%s-tool%d", tid, i), TraceID: tid, SessionID: sid, AgentName: "claude-code",
Kind: "TOOL", Name: "execute_tool " + tool, StartMs: at, EndMs: at + 50,
Attributes: fmt.Sprintf(`{"gen_ai.tool.name":%q,"qvr.outcome":"success"}`, tool),
})
}
meta := &store.SessionMetaRow{
SessionID: sid, AgentName: "claude-code", Model: "claude-opus-4-8",
Title: seed.Skill + " run", StartedMs: seed.StartedMs, EndedMs: end,
Turns: 1, Tools: int64(len(seed.Tools)), Skills: []string{seed.Skill}, Outcome: seed.Outcome,
DeriverVersion: 8,
}
if err := s.ReplaceSessionDerivation(ctx, meta, spans); err != nil {
t.Fatalf("seed session: %v", err)
}
return sid.String()
}
8 changes: 5 additions & 3 deletions cmd/audit.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,11 @@ instantly). Each session's verbatim trace lands in a local SQLite database,
attributed to the exact locked skill version that ran. Query it with
'qvr audit logs' / 'qvr audit sessions'.

The everyday surface is enable/disable, discover, status, sessions, logs, and
export. The remaining subcommands (ingest, raw, spans, rederive, gc) are
low-level plumbing the maintenance paths use and are hidden from this list.`,
The everyday surface is enable/disable, discover, status, sessions, logs,
export, and annotate/annotations (record and read human verdicts on what a
skill actually did — the feedback a self-improvement loop reads). The remaining
subcommands (ingest, raw, spans, rederive, gc) are low-level plumbing the
maintenance paths use and are hidden from this list.`,
// Reject a typo'd subcommand (`qvr audit enabel`) with a non-zero exit
// instead of silently printing help (issue #169 — the #120 fix missed this
// parent). No args still prints help.
Expand Down
Loading