diff --git a/docs/harness-subsystems.md b/docs/harness-subsystems.md index c553712..386f2a4 100644 --- a/docs/harness-subsystems.md +++ b/docs/harness-subsystems.md @@ -34,6 +34,53 @@ scope check on the raw arg instead of the normalized written path. **Checklist** every mutating tool emits `mutated`/`edit`/`create` (cross-check the `tools` table); rejects emit nothing; `countsAsMutation` exempts only `package.json`. +## loop / repair + snapshot — `src/loop/file-snapshot.ts`, `src/loop/quality.ts`, `src/loop/review-repair.ts` + +"Try an edit, keep only if it helps" loops: snapshot the editable scope → let the +agent edit → re-gate (and for quality, re-judge) → keep only on improvement, else +roll back. The revert is the load-bearing safety property. + +**Invariants** +- Snapshot/restore is glob-aware: `task.files` is documented as GLOBS, so anything + that walks the scope must expand them (`snapshotFiles`, `score`, `scopeCode` all + go through the shared walker), never read a literal glob path. +- Restore rewrites every content-backed file AND tombstones any file the attempt + created (binary-inclusive, uncapped scan — a created `.svg`/image must be deleted, + or "reverted" lies). Pre-existing files survive. +- A `reverted` event carries `count` = the batch's mutation count (so accept-rate + subtracts the whole batch, not 1). +- A throw mid-repair still restores before rethrowing (no half-applied batch on disk). + +**Risk areas** a scope walker that reads `task.files` literally (ENOENT on a glob — +fixed in `score`); content cap (128 KiB) silently NOT restoring an edited oversize/ +binary file while still emitting `reverted`; a created dir left behind (cosmetic). + +**Checklist** `tests/file-snapshot.test.ts` (glob expand, tombstone, binary asset), +`tests/quality.test.ts` (glob scope, gate-break revert), `tests/review-repair.test.ts` +(throw-restores, batch `count`). + +## loop / greenfield — `src/loop/greenfield/*` + +Filesystem-state outer loop: a `features.json` checklist drives an implement→evaluate +→persist cycle per feature until all green or a feature exhausts its attempts. + +**Invariants** +- Feature ids come from the model and become path components (`contracts/.md`), + so they're validated kebab (`isFeatureId`) at parse/load AND `basename`-guarded at + the write point (defence in depth against `../` traversal). +- State persists after every attempt (resume-first: an interrupted run picks up from + the last verified feature; a feature loaded at `attempts>=max` is `stuck`, never re-run). +- The evaluator is layered + short-circuits gate → browser → judge; the gate stays + the authority, the browser layer is skip-tolerant, the judge is reject-by-default + and trace-blind (design-rule #2: it sees the built artifact, never the generator trace). +- Contract negotiation is OFF unless `TSFORGE_CONTRACT` is set. + +**Risk areas** an unsafe id slipping past `isFeatureId`; an exhausted feature wedging +the loop; the judge seeing the generator's trace. + +**Checklist** `tests/greenfield.test.ts`, `tests/greenfield-planner.test.ts` +(unsafe-id drop), `tests/greenfield-contract.test.ts` (path-escape → `feature.md`). + ## tools — `src/loop/tools/*` Tool handlers + dispatch. Handlers return a `string` (model feedback); mutations are diff --git a/packages/core/src/loop/quality.ts b/packages/core/src/loop/quality.ts index ff717ba..093058f 100644 --- a/packages/core/src/loop/quality.ts +++ b/packages/core/src/loop/quality.ts @@ -1,9 +1,9 @@ -import { join } from "node:path"; import type { ITask } from "../spec"; import type { IAgent } from "../agent"; import type { IProvider } from "../inference"; import { validate, type ErrorParser } from "../validate"; import { runAccept } from "../validate"; +import { readFiles } from "../lib/fs"; import { judge } from "../eval"; import { qualityHints } from "./feedback"; import { snapshotFiles, restoreFiles } from "./file-snapshot"; @@ -157,12 +157,22 @@ async function score( judgeProvider: IProvider, meta: IQualityMeta ): Promise { - let code = ""; - - for (const file of task.files) { - code += `// ${file}\n${await Bun.file(join(cwd, file)).text()}\n\n`; + // Expand the editable scope through the shared walker (globs, dedupe, size-cap) + // — the same path `scopeCode`/`snapshotFiles` take. A literal per-file read here + // would throw ENOENT on a glob scope (e.g. `src/**/*.ts`), which `ITask.files` + // explicitly allows. + const views = await readFiles(cwd, task.files); + + // Nothing resolved (an empty glob match, or every file over the size cap): there + // is no artifact to assess. Return the floor WITHOUT calling the judge — an empty + // code window scores unpredictably and burns an LLM call. Degrade, never throw: + // quality repair is a best-effort post-green pass, not a gate. + if (views.length === 0) { + return { quality: 0, notes: "no files in scope to judge" }; } + const code = views.map((v) => `// ${v.path}\n${v.content}\n`).join("\n"); + const result = await judge(judgeProvider, { goal: meta.goal, criteria: meta.criteria, diff --git a/packages/core/tests/quality.test.ts b/packages/core/tests/quality.test.ts index eae315d..c9d9a1b 100644 --- a/packages/core/tests/quality.test.ts +++ b/packages/core/tests/quality.test.ts @@ -60,6 +60,96 @@ test("drives quality up, keeping improvements, until target", async () => { } }); +// Regression: `ITask.files` is documented as "editable scope GLOBS". `score()` +// used to read each entry literally (`Bun.file(join(cwd, "src/**/*.ts")).text()`), +// which throws ENOENT on a glob — crashing the whole repair before the first +// judge call. It must expand globs through the shared walker, like `scopeCode`. +test("scores a GLOB-scoped task without throwing (globs are expanded)", async () => { + const dir = await mkdtemp(join(tmpdir(), "tsforge-quality-glob-")); + + try { + await Bun.write(join(dir, "src", "a.ts"), "export const a = 1;"); + await Bun.write(join(dir, "src", "b.ts"), "export const b = 2;"); + + const seen: string[] = []; + const judgeProvider: IProvider = { + async complete(messages) { + seen.push(messages.map((m) => m.content).join("\n")); + + return { + content: JSON.stringify({ + overall: 5, + correctness: 5, + design: 5, + readability: 5, + notes: "great", + }), + toolCalls: [], + }; + }, + }; + const agent: IAgent = { + async implement() { + // already at target; never invoked + }, + }; + + const result = await qualityRepair( + { id: "1", accept: "true", files: ["src/**/*.ts"] }, + dir, + agent, + judgeProvider, + { goal: "g", criteria: "c" }, + { target: 5, maxAttempts: 1 } + ); + + expect(result.quality).toBe(5); + // The judge actually saw both glob-matched files' contents. + expect(seen[0]).toContain("export const a = 1;"); + expect(seen[0]).toContain("export const b = 2;"); + } finally { + await rm(dir, { recursive: true, force: true }); + } +}); + +// Regression: an empty scope (glob matched nothing / all over the size cap) must +// NOT send an empty code window to the judge — it scores unpredictably and wastes +// an LLM call. score() returns the floor without calling the provider. +test("an empty scope returns the floor without calling the judge", async () => { + const dir = await mkdtemp(join(tmpdir(), "tsforge-quality-empty-")); + + try { + let judgeCalls = 0; + const judgeProvider: IProvider = { + async complete() { + judgeCalls += 1; + + return { content: JSON.stringify({ overall: 5 }), toolCalls: [] }; + }, + }; + const agent: IAgent = { + async implement() { + // no-op: nothing to edit when the scope is empty + }, + }; + + const result = await qualityRepair( + { id: "1", accept: "true", files: ["does/not/exist/**/*.ts"] }, + dir, + agent, + judgeProvider, + { goal: "g", criteria: "c" }, + { target: 5, maxAttempts: 2 } + ); + + expect(judgeCalls).toBe(0); // never judged an empty window — the point of the guard + expect(result.quality).toBe(0); // floor, not an unpredictable empty-window score + expect(result.notes).toContain("no files in scope"); + } finally { + await rm(dir, { recursive: true, force: true }); + } +}); + test("reverts an attempt that breaks the gate", async () => { const dir = await mkdtemp(join(tmpdir(), "tsforge-quality-"));