diff --git a/evals/corpus/fixtures/fixtures.spec.md b/evals/corpus/fixtures/fixtures.spec.md new file mode 100644 index 0000000..9d34991 --- /dev/null +++ b/evals/corpus/fixtures/fixtures.spec.md @@ -0,0 +1,19 @@ +--- +id: fixtures +title: Entity fixture factories (one module per entity) +verify: bun test +mode: scratch +--- + +## Acceptance criteria + +A1. For each of the six entities — `user`, `order`, `product`, `invoice`, `payment`, `shipment` — a module `<entity>.ts` exports a factory named `make<Entity>` (e.g. `user.ts` → `makeUser`, `order.ts` → `makeOrder`). Each returns `IEntity`, imported from `./types` (`{ id: string; kind: string }`). + +A2. The factory returns an object whose `kind` is the entity name (e.g. `"user"`) and whose `id` is a non-empty string. The six modules are identical in shape — only the name and the `kind` value differ. + +## Tasks + +1. [fixtures] Create the six entity fixture modules + accept: bun test fixtures.test.ts + files: user.ts, order.ts, product.ts, invoice.ts, payment.ts, shipment.ts + context: fixtures.test.ts, types.ts diff --git a/evals/corpus/fixtures/fixtures.test.ts b/evals/corpus/fixtures/fixtures.test.ts new file mode 100644 index 0000000..b155c5c --- /dev/null +++ b/evals/corpus/fixtures/fixtures.test.ts @@ -0,0 +1,27 @@ +import { test, expect } from "bun:test"; +import type { IEntity } from "./types"; +import { makeUser } from "./user"; +import { makeOrder } from "./order"; +import { makeProduct } from "./product"; +import { makeInvoice } from "./invoice"; +import { makePayment } from "./payment"; +import { makeShipment } from "./shipment"; + +const cases: ReadonlyArray<[string, () => IEntity]> = [ + ["user", makeUser], + ["order", makeOrder], + ["product", makeProduct], + ["invoice", makeInvoice], + ["payment", makePayment], + ["shipment", makeShipment], +]; + +for (const [kind, make] of cases) { + test(`make ${kind} returns a tagged entity with a non-empty id`, () => { + const e = make(); + + expect(e.kind).toBe(kind); + 
expect(typeof e.id).toBe("string"); + expect(e.id.length).toBeGreaterThan(0); + }); +} diff --git a/evals/corpus/fixtures/invoice.ts b/evals/corpus/fixtures/invoice.ts new file mode 100644 index 0000000..3530b52 --- /dev/null +++ b/evals/corpus/fixtures/invoice.ts @@ -0,0 +1,5 @@ +import type { IEntity } from "./types"; + +export function makeInvoice(): IEntity { + return { id: "invoice-1", kind: "invoice" }; +} diff --git a/evals/corpus/fixtures/order.ts b/evals/corpus/fixtures/order.ts new file mode 100644 index 0000000..9192894 --- /dev/null +++ b/evals/corpus/fixtures/order.ts @@ -0,0 +1,5 @@ +import type { IEntity } from "./types"; + +export function makeOrder(): IEntity { + return { id: "order-1", kind: "order" }; +} diff --git a/evals/corpus/fixtures/payment.ts b/evals/corpus/fixtures/payment.ts new file mode 100644 index 0000000..94a0ea7 --- /dev/null +++ b/evals/corpus/fixtures/payment.ts @@ -0,0 +1,5 @@ +import type { IEntity } from "./types"; + +export function makePayment(): IEntity { + return { id: "payment-1", kind: "payment" }; +} diff --git a/evals/corpus/fixtures/product.ts b/evals/corpus/fixtures/product.ts new file mode 100644 index 0000000..bfd0e15 --- /dev/null +++ b/evals/corpus/fixtures/product.ts @@ -0,0 +1,5 @@ +import type { IEntity } from "./types"; + +export function makeProduct(): IEntity { + return { id: "product-1", kind: "product" }; +} diff --git a/evals/corpus/fixtures/shipment.ts b/evals/corpus/fixtures/shipment.ts new file mode 100644 index 0000000..29cbc86 --- /dev/null +++ b/evals/corpus/fixtures/shipment.ts @@ -0,0 +1,5 @@ +import type { IEntity } from "./types"; + +export function makeShipment(): IEntity { + return { id: "shipment-1", kind: "shipment" }; +} diff --git a/evals/corpus/fixtures/types.ts b/evals/corpus/fixtures/types.ts new file mode 100644 index 0000000..840d06e --- /dev/null +++ b/evals/corpus/fixtures/types.ts @@ -0,0 +1,4 @@ +export interface IEntity { + id: string; + kind: string; +} diff --git 
a/evals/corpus/fixtures/user.ts b/evals/corpus/fixtures/user.ts new file mode 100644 index 0000000..9863587 --- /dev/null +++ b/evals/corpus/fixtures/user.ts @@ -0,0 +1,5 @@ +import type { IEntity } from "./types"; + +export function makeUser(): IEntity { + return { id: "user-1", kind: "user" }; +} diff --git a/evals/corpus/handlers/created.ts b/evals/corpus/handlers/created.ts new file mode 100644 index 0000000..430ffb8 --- /dev/null +++ b/evals/corpus/handlers/created.ts @@ -0,0 +1,3 @@ +export function handleCreated(): { status: number; body: string } { + return { status: 201, body: "created" }; +} diff --git a/evals/corpus/handlers/gone.ts b/evals/corpus/handlers/gone.ts new file mode 100644 index 0000000..f14e4b4 --- /dev/null +++ b/evals/corpus/handlers/gone.ts @@ -0,0 +1,3 @@ +export function handleGone(): { status: number; body: string } { + return { status: 410, body: "gone" }; +} diff --git a/evals/corpus/handlers/handlers.spec.md b/evals/corpus/handlers/handlers.spec.md new file mode 100644 index 0000000..bfa94ba --- /dev/null +++ b/evals/corpus/handlers/handlers.spec.md @@ -0,0 +1,25 @@ +--- +id: handlers +title: Route handlers (one module per route) +verify: bun test +mode: scratch +--- + +## Acceptance criteria + +Each route is its own module exporting `handle<Route>(): { status: number; body: string }`. All seven share the same shape — only the status and body differ. + +A1. `health.ts` → `handleHealth` → `{ status: 200, body: "ok" }` +A2. `version.ts` → `handleVersion` → `{ status: 200, body: "v1" }` +A3. `ping.ts` → `handlePing` → `{ status: 200, body: "pong" }` +A4. `teapot.ts` → `handleTeapot` → `{ status: 418, body: "teapot" }` +A5. `notFound.ts` → `handleNotFound` → `{ status: 404, body: "not found" }` +A6. `gone.ts` → `handleGone` → `{ status: 410, body: "gone" }` +A7. `created.ts` → `handleCreated` → `{ status: 201, body: "created" }` + +## Tasks + +1. 
[handlers] Create the seven route handler modules + accept: bun test handlers.test.ts + files: health.ts, version.ts, ping.ts, teapot.ts, notFound.ts, gone.ts, created.ts + context: handlers.test.ts diff --git a/evals/corpus/handlers/handlers.test.ts b/evals/corpus/handlers/handlers.test.ts new file mode 100644 index 0000000..83aeeb5 --- /dev/null +++ b/evals/corpus/handlers/handlers.test.ts @@ -0,0 +1,29 @@ +import { test, expect } from "bun:test"; +import { handleHealth } from "./health"; +import { handleVersion } from "./version"; +import { handlePing } from "./ping"; +import { handleTeapot } from "./teapot"; +import { handleNotFound } from "./notFound"; +import { handleGone } from "./gone"; +import { handleCreated } from "./created"; + +interface IReply { + status: number; + body: string; +} + +const cases: ReadonlyArray<[() => IReply, number, string]> = [ + [handleHealth, 200, "ok"], + [handleVersion, 200, "v1"], + [handlePing, 200, "pong"], + [handleTeapot, 418, "teapot"], + [handleNotFound, 404, "not found"], + [handleGone, 410, "gone"], + [handleCreated, 201, "created"], +]; + +for (const [handle, status, body] of cases) { + test(`${body} handler returns ${String(status)}`, () => { + expect(handle()).toEqual({ status, body }); + }); +} diff --git a/evals/corpus/handlers/health.ts b/evals/corpus/handlers/health.ts new file mode 100644 index 0000000..c709688 --- /dev/null +++ b/evals/corpus/handlers/health.ts @@ -0,0 +1,3 @@ +export function handleHealth(): { status: number; body: string } { + return { status: 200, body: "ok" }; +} diff --git a/evals/corpus/handlers/notFound.ts b/evals/corpus/handlers/notFound.ts new file mode 100644 index 0000000..a936c80 --- /dev/null +++ b/evals/corpus/handlers/notFound.ts @@ -0,0 +1,3 @@ +export function handleNotFound(): { status: number; body: string } { + return { status: 404, body: "not found" }; +} diff --git a/evals/corpus/handlers/ping.ts b/evals/corpus/handlers/ping.ts new file mode 100644 index 0000000..cb5bd05 
--- /dev/null +++ b/evals/corpus/handlers/ping.ts @@ -0,0 +1,3 @@ +export function handlePing(): { status: number; body: string } { + return { status: 200, body: "pong" }; +} diff --git a/evals/corpus/handlers/teapot.ts b/evals/corpus/handlers/teapot.ts new file mode 100644 index 0000000..062793f --- /dev/null +++ b/evals/corpus/handlers/teapot.ts @@ -0,0 +1,3 @@ +export function handleTeapot(): { status: number; body: string } { + return { status: 418, body: "teapot" }; +} diff --git a/evals/corpus/handlers/version.ts b/evals/corpus/handlers/version.ts new file mode 100644 index 0000000..b62d21e --- /dev/null +++ b/evals/corpus/handlers/version.ts @@ -0,0 +1,3 @@ +export function handleVersion(): { status: number; body: string } { + return { status: 200, body: "v1" }; +} diff --git a/evals/corpus/migrate/api.ts b/evals/corpus/migrate/api.ts new file mode 100644 index 0000000..5ead191 --- /dev/null +++ b/evals/corpus/migrate/api.ts @@ -0,0 +1,7 @@ +export function oldApi(payload: string): string { + return payload; +} + +export function newApi(payload: string, tier: string): string { + return `${tier}:${payload}`; +} diff --git a/evals/corpus/migrate/migrate.spec.md b/evals/corpus/migrate/migrate.spec.md new file mode 100644 index 0000000..78f69d4 --- /dev/null +++ b/evals/corpus/migrate/migrate.spec.md @@ -0,0 +1,19 @@ +--- +id: migrate +title: Migrate every service from oldApi to newApi (per-file tier) +verify: bun test +mode: existing +--- + +## Acceptance criteria + +A1. Every `svc<N>.ts` currently calls the deprecated `oldApi(payload)`. Migrate each to `newApi(payload, tier)`, where `tier` is the string from that file's `// tier: <name>` header comment (e.g. `svc1.ts` is `// tier: gold` → `newApi("ping", "gold")`). The tier differs per file, so each edit is distinct — you must read each file to know its tier. + +A2. Import `newApi` from `./api` and remove the now-unused `oldApi` import (the gate forbids unused imports). Do not change `api.ts` or the payload string. 
+ +## Tasks + +1. [migrate] Migrate all eight services to newApi with their per-file tier + accept: bun test migrate.test.ts + files: svc1.ts, svc2.ts, svc3.ts, svc4.ts, svc5.ts, svc6.ts, svc7.ts, svc8.ts + context: migrate.test.ts, api.ts diff --git a/evals/corpus/migrate/migrate.test.ts b/evals/corpus/migrate/migrate.test.ts new file mode 100644 index 0000000..9fce63f --- /dev/null +++ b/evals/corpus/migrate/migrate.test.ts @@ -0,0 +1,26 @@ +import { test, expect } from "bun:test"; +import { run as r1 } from "./svc1"; +import { run as r2 } from "./svc2"; +import { run as r3 } from "./svc3"; +import { run as r4 } from "./svc4"; +import { run as r5 } from "./svc5"; +import { run as r6 } from "./svc6"; +import { run as r7 } from "./svc7"; +import { run as r8 } from "./svc8"; + +const cases: ReadonlyArray<[() => string, string]> = [ + [r1, "gold:ping"], + [r2, "silver:ping"], + [r3, "bronze:ping"], + [r4, "platinum:ping"], + [r5, "diamond:ping"], + [r6, "copper:ping"], + [r7, "iron:ping"], + [r8, "steel:ping"], +]; + +for (const [run, expected] of cases) { + test(`migrated service returns ${expected}`, () => { + expect(run()).toBe(expected); + }); +} diff --git a/evals/corpus/migrate/svc1.ts b/evals/corpus/migrate/svc1.ts new file mode 100644 index 0000000..80a621d --- /dev/null +++ b/evals/corpus/migrate/svc1.ts @@ -0,0 +1,6 @@ +// tier: gold +import { oldApi } from "./api"; + +export function run(): string { + return oldApi("ping"); +} diff --git a/evals/corpus/migrate/svc2.ts b/evals/corpus/migrate/svc2.ts new file mode 100644 index 0000000..568a0c7 --- /dev/null +++ b/evals/corpus/migrate/svc2.ts @@ -0,0 +1,6 @@ +// tier: silver +import { oldApi } from "./api"; + +export function run(): string { + return oldApi("ping"); +} diff --git a/evals/corpus/migrate/svc3.ts b/evals/corpus/migrate/svc3.ts new file mode 100644 index 0000000..7d6e62b --- /dev/null +++ b/evals/corpus/migrate/svc3.ts @@ -0,0 +1,6 @@ +// tier: bronze +import { oldApi } from "./api"; + +export 
function run(): string { + return oldApi("ping"); +} diff --git a/evals/corpus/migrate/svc4.ts b/evals/corpus/migrate/svc4.ts new file mode 100644 index 0000000..283f58b --- /dev/null +++ b/evals/corpus/migrate/svc4.ts @@ -0,0 +1,6 @@ +// tier: platinum +import { oldApi } from "./api"; + +export function run(): string { + return oldApi("ping"); +} diff --git a/evals/corpus/migrate/svc5.ts b/evals/corpus/migrate/svc5.ts new file mode 100644 index 0000000..8f2330e --- /dev/null +++ b/evals/corpus/migrate/svc5.ts @@ -0,0 +1,6 @@ +// tier: diamond +import { oldApi } from "./api"; + +export function run(): string { + return oldApi("ping"); +} diff --git a/evals/corpus/migrate/svc6.ts b/evals/corpus/migrate/svc6.ts new file mode 100644 index 0000000..62be1c4 --- /dev/null +++ b/evals/corpus/migrate/svc6.ts @@ -0,0 +1,6 @@ +// tier: copper +import { oldApi } from "./api"; + +export function run(): string { + return oldApi("ping"); +} diff --git a/evals/corpus/migrate/svc7.ts b/evals/corpus/migrate/svc7.ts new file mode 100644 index 0000000..4e01576 --- /dev/null +++ b/evals/corpus/migrate/svc7.ts @@ -0,0 +1,6 @@ +// tier: iron +import { oldApi } from "./api"; + +export function run(): string { + return oldApi("ping"); +} diff --git a/evals/corpus/migrate/svc8.ts b/evals/corpus/migrate/svc8.ts new file mode 100644 index 0000000..8b3405b --- /dev/null +++ b/evals/corpus/migrate/svc8.ts @@ -0,0 +1,6 @@ +// tier: steel +import { oldApi } from "./api"; + +export function run(): string { + return oldApi("ping"); +} diff --git a/evals/corpus/validators/email.ts b/evals/corpus/validators/email.ts new file mode 100644 index 0000000..ee37f68 --- /dev/null +++ b/evals/corpus/validators/email.ts @@ -0,0 +1,3 @@ +export function isEmail(v: string): boolean { + return /^[^@\s]+@[^@\s]+\.[^@\s]+$/u.test(v); +} diff --git a/evals/corpus/validators/hexColor.ts b/evals/corpus/validators/hexColor.ts new file mode 100644 index 0000000..c7fb066 --- /dev/null +++ 
b/evals/corpus/validators/hexColor.ts @@ -0,0 +1,3 @@ +export function isHexColor(v: string): boolean { + return /^#?[0-9a-f]{6}$/iu.test(v); +} diff --git a/evals/corpus/validators/nonEmpty.ts b/evals/corpus/validators/nonEmpty.ts new file mode 100644 index 0000000..6b37c8b --- /dev/null +++ b/evals/corpus/validators/nonEmpty.ts @@ -0,0 +1,3 @@ +export function isNonEmpty(v: string): boolean { + return v.trim().length > 0; +} diff --git a/evals/corpus/validators/positive.ts b/evals/corpus/validators/positive.ts new file mode 100644 index 0000000..0849525 --- /dev/null +++ b/evals/corpus/validators/positive.ts @@ -0,0 +1,5 @@ +export function isPositive(v: string): boolean { + const n = Number(v); + + return Number.isFinite(n) && n > 0; +} diff --git a/evals/corpus/validators/slug.ts b/evals/corpus/validators/slug.ts new file mode 100644 index 0000000..15a8f7a --- /dev/null +++ b/evals/corpus/validators/slug.ts @@ -0,0 +1,3 @@ +export function isSlug(v: string): boolean { + return /^[a-z0-9]+(?:-[a-z0-9]+)*$/u.test(v); +} diff --git a/evals/corpus/validators/uuid.ts b/evals/corpus/validators/uuid.ts new file mode 100644 index 0000000..6c3a159 --- /dev/null +++ b/evals/corpus/validators/uuid.ts @@ -0,0 +1,3 @@ +export function isUuid(v: string): boolean { + return /^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/iu.test(v); +} diff --git a/evals/corpus/validators/validators.spec.md b/evals/corpus/validators/validators.spec.md new file mode 100644 index 0000000..fed58b4 --- /dev/null +++ b/evals/corpus/validators/validators.spec.md @@ -0,0 +1,24 @@ +--- +id: validators +title: Field validators (one predicate module per rule) +verify: bun test +mode: scratch +--- + +## Acceptance criteria + +Each rule lives in its own module exporting a single predicate `(v: string) => boolean`. All six share the same shape — only the rule differs. + +A1. `nonEmpty.ts` → `isNonEmpty`: true iff `v` has at least one non-whitespace character. +A2. 
`positive.ts` → `isPositive`: true iff `v` parses to a finite number greater than 0. +A3. `email.ts` → `isEmail`: true iff `v` looks like `local@domain.tld` (non-empty local, domain, and TLD; no spaces or stray `@`). +A4. `slug.ts` → `isSlug`: true iff `v` is lowercase alphanumeric words joined by single hyphens (e.g. `my-post-1`), no leading/trailing/double hyphens, no spaces or uppercase. +A5. `hexColor.ts` → `isHexColor`: true iff `v` is a 6-digit hex color, optional leading `#` (e.g. `#a1b2c3` or `a1b2c3`), case-insensitive. +A6. `uuid.ts` → `isUuid`: true iff `v` is a canonical 8-4-4-4-12 hex UUID. + +## Tasks + +1. [validators] Create the six predicate modules + accept: bun test validators.test.ts + files: nonEmpty.ts, positive.ts, email.ts, slug.ts, hexColor.ts, uuid.ts + context: validators.test.ts diff --git a/evals/corpus/validators/validators.test.ts b/evals/corpus/validators/validators.test.ts new file mode 100644 index 0000000..0ba4651 --- /dev/null +++ b/evals/corpus/validators/validators.test.ts @@ -0,0 +1,37 @@ +import { test, expect } from "bun:test"; +import { isNonEmpty } from "./nonEmpty"; +import { isPositive } from "./positive"; +import { isEmail } from "./email"; +import { isSlug } from "./slug"; +import { isHexColor } from "./hexColor"; +import { isUuid } from "./uuid"; + +const valid: ReadonlyArray<[string, (v: string) => boolean, string]> = [ + ["nonEmpty", isNonEmpty, "x"], + ["positive", isPositive, "3"], + ["email", isEmail, "a@b.co"], + ["slug", isSlug, "my-post-1"], + ["hexColor", isHexColor, "#a1b2c3"], + ["uuid", isUuid, "123e4567-e89b-12d3-a456-426614174000"], +]; + +const invalid: ReadonlyArray<[string, (v: string) => boolean, string]> = [ + ["nonEmpty", isNonEmpty, " "], + ["positive", isPositive, "-2"], + ["email", isEmail, "nope"], + ["slug", isSlug, "Not A Slug"], + ["hexColor", isHexColor, "#zzz"], + ["uuid", isUuid, "123"], +]; + +for (const [name, fn, ok] of valid) { + test(`${name} accepts a valid value`, () => { + 
expect(fn(ok)).toBe(true); + }); +} + +for (const [name, fn, bad] of invalid) { + test(`${name} rejects an invalid value`, () => { + expect(fn(bad)).toBe(false); + }); +} diff --git a/packages/core/scripts/sweep.ts b/packages/core/scripts/sweep.ts index 2bccc5c..0ad6ac2 100644 --- a/packages/core/scripts/sweep.ts +++ b/packages/core/scripts/sweep.ts @@ -82,6 +82,7 @@ const DIM_ENV: Record = { hashline: "TSFORGE_HASHLINE", lsp_write_feedback: "TSFORGE_LSP_WRITE_FEEDBACK", simplicity: "TSFORGE_SIMPLICITY", + web: "TSFORGE_WEB", }; /** Map feature variant to env vars. Most dims set their var to the state; `git` @@ -96,6 +97,14 @@ function variantToEnvVars(variant: IFeatureVariant): Record { continue; } + // `script` is default-ON; like `git` it gates a NO_ flag, so script=on → + // the tool is available (NO_SCRIPT unset), script=off → withheld. + if (dim === "script") { + envVars.TSFORGE_NO_SCRIPT = state === "1" ? "0" : "1"; + + continue; + } + const varName = DIM_ENV[dim]; if (varName !== undefined) { diff --git a/packages/core/src/agent/agent.constants.ts b/packages/core/src/agent/agent.constants.ts index 9a96fca..1fbfc51 100644 --- a/packages/core/src/agent/agent.constants.ts +++ b/packages/core/src/agent/agent.constants.ts @@ -1,3 +1,5 @@ +import type { ToolName } from "./agent.types"; + /** * The canonical tool names. Schemas, dispatch, and any name comparison reference * these — never a bare string literal (so a rename is one edit and typos can't @@ -28,31 +30,84 @@ export const TOOL_NAME = { webFetch: "web_fetch", webSearch: "web_search", webBrowse: "web_browse", + script: "script", yieldStatus: "yield_status", } as const; -/** Tools that cannot mutate the workspace — the PLAN-MODE set. `run` is absent - * on purpose: it is special-cased (allowed only for read-only commands — see - * isReadOnlyCommand in loop/tools/file-ops). 
*/ -export const READ_ONLY_TOOL_NAMES: ReadonlySet = new Set([ - TOOL_NAME.read, - TOOL_NAME.search, - TOOL_NAME.symbolSearch, - TOOL_NAME.findReferences, - TOOL_NAME.typeAt, - TOOL_NAME.diagnostics, +/** Per-tool capability flags — the single source of truth the plan-mode set and + * the script-exposable subset are derived from (so a new tool declares its + * behaviour ONCE here instead of being added to several hand-kept sets). + * - `readOnly`: cannot mutate the workspace ⇒ allowed in plan mode. `run` is + * deliberately false: it is special-cased (allowed only for read-only + * commands — see isReadOnlyCommand in loop/tools/file-ops). + * - `scriptExposable`: safe + useful to call from inside a `script` program via + * the generated RPC stubs. Excludes the heavy/interactive scaffolds, the + * dependency installer, the turn-ending yield, and `script` itself (no + * recursion). Mutating tools (edit/create/…) ARE exposable — they still flow + * back through executeTool's scope + write-guard + gate. 
*/ +export interface IToolSpec { + readOnly: boolean; + scriptExposable: boolean; +} + +export const TOOL_SPECS: Readonly> = { + [TOOL_NAME.read]: { readOnly: true, scriptExposable: true }, + [TOOL_NAME.run]: { readOnly: false, scriptExposable: true }, + [TOOL_NAME.edit]: { readOnly: false, scriptExposable: true }, + [TOOL_NAME.editLines]: { readOnly: false, scriptExposable: true }, + [TOOL_NAME.create]: { readOnly: false, scriptExposable: true }, + [TOOL_NAME.search]: { readOnly: true, scriptExposable: true }, + [TOOL_NAME.symbolSearch]: { readOnly: true, scriptExposable: true }, + [TOOL_NAME.findReferences]: { readOnly: true, scriptExposable: true }, + [TOOL_NAME.typeAt]: { readOnly: true, scriptExposable: true }, + [TOOL_NAME.diagnostics]: { readOnly: true, scriptExposable: true }, + [TOOL_NAME.renameSymbol]: { readOnly: false, scriptExposable: true }, + [TOOL_NAME.moveFile]: { readOnly: false, scriptExposable: true }, + [TOOL_NAME.organizeImports]: { readOnly: false, scriptExposable: true }, // git_context only inspects history/diffs — no workspace mutation — so it is a // plan-mode tool too (scope a review/fix while planning, before any edit). - TOOL_NAME.gitContext, - TOOL_NAME.packageInfo, - TOOL_NAME.packageDocs, + [TOOL_NAME.gitContext]: { readOnly: true, scriptExposable: true }, + [TOOL_NAME.scaffoldUi]: { readOnly: false, scriptExposable: false }, + [TOOL_NAME.scaffoldRoutes]: { readOnly: false, scriptExposable: false }, + [TOOL_NAME.scaffoldWeb]: { readOnly: false, scriptExposable: false }, + [TOOL_NAME.addDependency]: { readOnly: false, scriptExposable: false }, + [TOOL_NAME.packageInfo]: { readOnly: true, scriptExposable: true }, + [TOOL_NAME.packageDocs]: { readOnly: true, scriptExposable: true }, // Web tools are read-only (no workspace mutation), so they're usable in plan // mode too — research while planning. Network egress here is structured and // opt-in (TSFORGE_WEB), unlike the raw `run` curl path plan mode blocks. 
- TOOL_NAME.webFetch, - TOOL_NAME.webSearch, - TOOL_NAME.webBrowse, -]); + [TOOL_NAME.webFetch]: { readOnly: true, scriptExposable: true }, + [TOOL_NAME.webSearch]: { readOnly: true, scriptExposable: true }, + [TOOL_NAME.webBrowse]: { readOnly: true, scriptExposable: true }, + // `script` mutates (it can call edit/create) and must never call itself. + [TOOL_NAME.script]: { readOnly: false, scriptExposable: false }, + [TOOL_NAME.yieldStatus]: { readOnly: false, scriptExposable: false }, +}; + +function toolNamesWhere( + pick: (spec: IToolSpec) => boolean +): ReadonlySet { + const names = new Set(); + + for (const [name, spec] of Object.entries(TOOL_SPECS)) { + if (pick(spec)) { + names.add(name); + } + } + + return names; +} + +/** Tools that cannot mutate the workspace — the PLAN-MODE set (derived from + * TOOL_SPECS). `run` is absent on purpose (special-cased; see above). */ +export const READ_ONLY_TOOL_NAMES: ReadonlySet = toolNamesWhere( + (spec) => spec.readOnly +); + +/** Tools the model may call from inside a `script` program (derived). */ +export const SCRIPT_EXPOSABLE_TOOLS: ReadonlySet = toolNamesWhere( + (spec) => spec.scriptExposable +); /** The model's own decision to start a from-scratch WEB app: scaffolds the stack * (Vite + the chosen framework + deps) and switches the session to the web gate. @@ -370,6 +425,34 @@ export const PACKAGE_DOCS_TOOL = { }, }; +/** Programmatic Tool Calling: the model writes ONE TypeScript program that calls + * tools through generated stubs, collapsing a multi-step tool chain into a single + * turn. Default-on (kill switch: TSFORGE_NO_SCRIPT); withheld in plan mode (it can write). */ +export const SCRIPT_TOOL = { + type: "function", + function: { + name: TOOL_NAME.script, + description: + "Run ONE TypeScript program that calls tools via stubs imported from `./tsforge-tools`, instead of many separate tool turns. Best for repetitive multi-step work — read/scan many files, fetch+compare several packages, transform-then-write across files. 
Each stub (e.g. `read`, `run`, `web_search`, `edit`, `create`) is async and returns the tool's text result; only your script's stdout (use console.log) comes back to you. File changes MUST go through the `edit`/`create` stubs (not direct fs writes) so they pass the scope + type/lint gate. Bounded by a wall-clock timeout and a tool-call cap.", + parameters: { + type: "object", + properties: { + code: { + type: "string", + description: + "the TypeScript program; `import { read, run, web_search, edit, create } from './tsforge-tools'` and console.log what you want returned", + }, + timeoutMs: { + type: "number", + description: + "optional wall-clock budget in ms (default 60000, max 300000)", + }, + }, + required: ["code"], + }, + }, +}; + /** * Semantic + search tools backed by the in-process TypeScript LanguageService * (+ ripgrep). Read-only tools (find_references, type_at, symbol_search, diff --git a/packages/core/src/config/config.constants.ts b/packages/core/src/config/config.constants.ts index c9cb108..160b624 100644 --- a/packages/core/src/config/config.constants.ts +++ b/packages/core/src/config/config.constants.ts @@ -9,6 +9,7 @@ export const ENV_FLAG = { simplicity: "TSFORGE_SIMPLICITY", tdd: "TSFORGE_TDD", webTools: "TSFORGE_WEB", + noScriptTool: "TSFORGE_NO_SCRIPT", noUpdateCheck: "TSFORGE_NO_UPDATE_CHECK", noGitTool: "TSFORGE_NO_GIT_TOOL", } as const; diff --git a/packages/core/src/config/flags.ts b/packages/core/src/config/flags.ts index 804e3b7..e8b3bfb 100644 --- a/packages/core/src/config/flags.ts +++ b/packages/core/src/config/flags.ts @@ -45,6 +45,11 @@ export const flags = { * web_fetch extracts locally; web_search uses DuckDuckGo (or a self-hosted * SearXNG via TSFORGE_SEARXNG_URL). */ webTools: (): boolean => isOn(ENV_FLAG.webTools), + /** Programmatic Tool Calling: advertise the `script` tool. 
ON by default — it + * measurably speeds up read-dependent multi-file work (codemods) and is a no-op + * on simple tasks; withhold with TSFORGE_NO_SCRIPT (the A/B / kill switch). It + * makes no network calls, so default-on keeps eval sweeps deterministic. */ + scriptTool: (): boolean => !isOn(ENV_FLAG.noScriptTool), /** Disable the startup "update available" npm-registry check (default ON, i.e. * the check runs only in interactive non-CI sessions). Set to "1" for offline * environments or to silence the notice. */ diff --git a/packages/core/src/loop/prompt/prompt.ts b/packages/core/src/loop/prompt/prompt.ts index 8511d03..4f9b73b 100644 --- a/packages/core/src/loop/prompt/prompt.ts +++ b/packages/core/src/loop/prompt/prompt.ts @@ -56,6 +56,24 @@ export function buildWebResearchGuidance(): string { ].join("\n"); } +export function buildScriptToolGuidance(): string { + return [ + "SCRIPT — one program for work where you must READ each file to compute its change:", + " • Reach for `script` ONLY when the change to many (≈5+) files DEPENDS on first", + " reading each file — e.g. update a call in every file using a value declared in", + " that same file. Normally that's a read turn THEN an edit turn (the contents", + " flood your context); a script does read→edit per file in ONE loop, one turn,", + " and only its `console.log` returns.", + " • `import { read, edit, create, run } from './tsforge-tools'` — each stub is", + " async and returns the tool's text result. Log a short summary, not the files.", + " • Edits/creates MUST go through the `edit`/`create` stubs (NOT `node:fs`/", + " `Bun.write`) so they still pass scope + the type/lint gate.", + " • Do NOT use it when you can already act in one turn WITHOUT reading first —", + " creating several files from the spec, or a single edit. Emitting those tool", + " calls directly is simpler and no slower. 
It cannot call `script` itself.", + ].join("\n"); +} + /** Appended to SYSTEM for from-scratch, NON-web utility builds when the simplicity * flag is on. Pushes the model toward the shortest correct solution — the axis the * gate is blind to (it checks correctness, never concision). Carve-outs keep it @@ -123,6 +141,10 @@ export function buildSystemPrompt( blocks.push(buildWebResearchGuidance()); } + if (flags.scriptTool()) { + blocks.push(buildScriptToolGuidance()); + } + // Simplicity: from-scratch, non-web only (an A/B-gated concision push). if (flags.simplicity() && !hasExistingCode && !webish) { blocks.push(buildScratchSimplicityGuidance(conventions)); @@ -164,6 +186,12 @@ export function buildChatSystem(conventions: IConventions): string { ); } + if (flags.scriptTool()) { + lines.push( + "The `script` tool is enabled: for repetitive multi-step tool work (scan many files, fetch+compare several packages, transform-then-write), write ONE TypeScript program importing stubs from `./tsforge-tools` instead of many tool turns; route file changes through the `edit`/`create` stubs." 
+ ); + } + return lines.join("\n"); } diff --git a/packages/core/src/loop/tools/execute-tool.ts b/packages/core/src/loop/tools/execute-tool.ts index 20ad146..eaad684 100644 --- a/packages/core/src/loop/tools/execute-tool.ts +++ b/packages/core/src/loop/tools/execute-tool.ts @@ -12,6 +12,7 @@ import { doWebFetch } from "./web-fetch"; import { doWebSearch } from "./web-search"; import { doWebBrowse } from "./web-browse"; import { doPackageInfo, doPackageDocs } from "./package-info"; +import { doScript } from "./script-tool"; import { reject, type IToolContext } from "./tool-context"; import { classifyAction, @@ -52,6 +53,10 @@ const HANDLERS: Record = { [TOOL_NAME.webFetch]: doWebFetch, [TOOL_NAME.webSearch]: doWebSearch, [TOOL_NAME.webBrowse]: doWebBrowse, + // The script's stubs RPC back into executeTool — passed as `execute` here so + // script-tool.ts never imports this module (no cycle), and a nested `script` + // call is rejected (script is not in SCRIPT_EXPOSABLE_TOOLS). + [TOOL_NAME.script]: (a, c) => doScript(a, c, { execute: executeTool }), // yield_status is intercepted by the Session BEFORE tool dispatch (it ends the // turn); this handler only fires if one slips through with other calls. 
[TOOL_NAME.yieldStatus]: () => diff --git a/packages/core/src/loop/tools/script-tool.ts b/packages/core/src/loop/tools/script-tool.ts new file mode 100644 index 0000000..89867c3 --- /dev/null +++ b/packages/core/src/loop/tools/script-tool.ts @@ -0,0 +1,314 @@ +import { mkdtemp, rm, writeFile } from "node:fs/promises"; +import { join } from "node:path"; +import { randomUUID } from "node:crypto"; +import type { IToolCall } from "../../inference"; +import { isRecord } from "../../lib/guards"; +import { SCRIPT_EXPOSABLE_TOOLS, TOOL_NAME } from "../../agent"; +import { LOOP_LIMITS } from "../loop.constants"; +import { condenseToolOutput } from "./condense"; +import { reject, str, type IToolContext } from "./tool-context"; + +/** The single tool-dispatch entry the RPC server calls per request. Injected from + * execute-tool.ts so script-tool.ts never imports executeTool (no import cycle). */ +export type ExecuteFn = (call: IToolCall, ctx: IToolContext) => Promise; + +export interface IScriptDeps { + execute: ExecuteFn; +} + +const DEFAULT_TIMEOUT_MS = 60_000; +const MAX_TIMEOUT_MS = 300_000; +const DEFAULT_MAX_CALLS = 50; + +function envInt(name: string, fallback: number, max: number): number { + const raw = process.env[name]; + const parsed = raw === undefined ? NaN : Number.parseInt(raw, 10); + + if (Number.isFinite(parsed) && parsed > 0) { + return Math.min(parsed, max); + } + + return fallback; +} + +function timeoutMs(args: Record): number { + const value = args.timeoutMs; + + if (typeof value === "number" && Number.isFinite(value) && value > 0) { + return Math.min(Math.floor(value), MAX_TIMEOUT_MS); + } + + return envInt( + "TSFORGE_SCRIPT_TIMEOUT_MS", + DEFAULT_TIMEOUT_MS, + MAX_TIMEOUT_MS + ); +} + +/** The stub module the script imports as `./tsforge-tools`. One async function per + * exposed tool, each POSTing `{tool,args}` to the loopback RPC server with the + * one-time token. Plain JS (no type annotations) so `bun run` needs no config. 
*/ +export function generateToolStubs(exposed: readonly string[]): string { + const fns = exposed + .map( + (name) => + `export async function ${name}(args = {}) { return __call(${JSON.stringify(name)}, args); }` + ) + .join("\n"); + + return `// AUTO-GENERATED by tsforge — do not edit. Tool stubs for the script sandbox. +const __URL = process.env.TSFORGE_RPC_URL; +const __TOKEN = process.env.TSFORGE_RPC_TOKEN; + +async function __call(tool, args) { + const res = await fetch(__URL, { + method: "POST", + headers: { "content-type": "application/json", "x-tsforge-token": __TOKEN }, + body: JSON.stringify({ tool, args: args ?? {} }), + }); + let data; + try { + data = await res.json(); + } catch { + throw new Error(\`tsforge tool \${tool}: bad RPC response\`); + } + if (!res.ok || typeof data?.error === "string") { + throw new Error(data?.error ?? \`tsforge tool \${tool} failed\`); + } + return data.result; +} + +${fns} +`; +} + +interface IRpcOutcome { + result?: string; + error?: string; +} + +/** Run `fn`s strictly one at a time (single-slot chain) so concurrent script tool + * calls never interleave a mutation through the shared tool context. 
*/ +function makeSerializer(): (fn: () => Promise) => Promise { + let tail: Promise = Promise.resolve(); + + return (fn: () => Promise): Promise => { + const run = tail.then(fn, fn); + + tail = run.then( + () => undefined, + () => undefined + ); + + return run; + }; +} + +interface IScriptServer { + url: string; + token: string; + stop: () => void; + callCount: () => number; +} + +function startRpcServer(ctx: IToolContext, deps: IScriptDeps): IScriptServer { + const token = randomUUID(); + const maxCalls = envInt("TSFORGE_SCRIPT_MAX_CALLS", DEFAULT_MAX_CALLS, 1_000); + const serialize = makeSerializer(); + let calls = 0; + + async function handle(req: Request): Promise { + if (req.headers.get("x-tsforge-token") !== token) { + return { error: "forbidden" }; + } + + const body: unknown = await req.json(); + const tool = + isRecord(body) && typeof body.tool === "string" ? body.tool : ""; + const rawArgs = isRecord(body) ? body.args : undefined; + const args = isRecord(rawArgs) ? rawArgs : {}; + + if (tool === TOOL_NAME.script || !SCRIPT_EXPOSABLE_TOOLS.has(tool)) { + return { error: `tool \`${tool}\` is not callable from a script` }; + } + + calls += 1; + + if (calls > maxCalls) { + return { + error: `script tool-call limit (${String(maxCalls)}) exceeded — do the rest in another turn`, + }; + } + + ctx.report({ + kind: "tool", + task: ctx.task, + message: `↳ script:${tool}`, + }); + + const result = await serialize(() => + deps.execute({ name: tool, arguments: args }, ctx) + ); + + return { result }; + } + + const server = Bun.serve({ + hostname: "127.0.0.1", + port: 0, + async fetch(req) { + const outcome = await handle(req).catch((err: unknown) => ({ + error: err instanceof Error ? err.message : "rpc error", + })); + const status = outcome.error === "forbidden" ? 
403 : 200; + + return Response.json(outcome, { status }); + }, + }); + + return { + url: `http://127.0.0.1:${String(server.port)}/call`, + token, + stop: () => { + void server.stop(true); + }, + callCount: () => calls, + }; +} + +interface IRunResult { + exitCode: number; + output: string; +} + +async function runScript( + scriptPath: string, + cwd: string, + env: Record, + budgetMs: number, + signal: AbortSignal | undefined +): Promise { + const proc = Bun.spawn(["bun", "run", scriptPath], { + cwd, + env, + stdout: "pipe", + stderr: "pipe", + }); + // Holder (not a bare `let`) so the union type survives — a plain variable + // assigned only inside the timer/abort closures gets narrowed to `null` by + // control-flow analysis, which would dead-flag the `note` comparisons below. + const kill: { reason: "timeout" | "abort" | null } = { reason: null }; + const timer = setTimeout(() => { + kill.reason = "timeout"; + proc.kill("SIGKILL"); + }, budgetMs); + + const onAbort = (): void => { + kill.reason = "abort"; + proc.kill("SIGKILL"); + }; + + signal?.addEventListener("abort", onAbort, { once: true }); + + try { + const [stdout, stderr] = await Promise.all([ + new Response(proc.stdout).text(), + new Response(proc.stderr).text(), + ]); + + await proc.exited; + + const note = + kill.reason === "timeout" + ? `\n[script killed: exceeded ${String(budgetMs)}ms timeout]` + : kill.reason === "abort" + ? "\n[script aborted]" + : ""; + + return { + exitCode: proc.exitCode ?? 1, + output: `${stdout}${stderr}${note}`, + }; + } finally { + clearTimeout(timer); + signal?.removeEventListener("abort", onAbort); + } +} + +/** + * `script` — run one TypeScript program that calls tools through generated RPC + * stubs, collapsing a multi-step tool chain into a SINGLE model turn. 
Every stub + * call routes back through `executeTool` (deps.execute), so scope, the unified + * policy, the write-guard, mutation accounting, and the gate all still apply — the + * model gets ergonomics, not new powers. Only the script's stdout returns to the + * model. Bounded by a wall-clock timeout, a per-script tool-call cap, and output + * condensing. NOT offered in plan mode (it can write), and it cannot call itself. + */ +export async function doScript( + args: Record, + ctx: IToolContext, + deps: IScriptDeps +): Promise { + const code = str(args, "code").trim(); + + if (code.length === 0) { + return reject( + ctx, + "script", + "script: `code` must be a non-empty TypeScript program (import tools from `./tsforge-tools`)." + ); + } + + ctx.report({ kind: "tool", task: ctx.task, message: "↳ script" }); + + // Inside the workspace (hidden, dot-prefixed) — NOT the system tmp dir — so the + // script resolves the project's `node_modules` and relative imports by walking + // up from here to ctx.cwd. The dot prefix keeps eslint/tsc from picking it up, + // and it is removed in `finally`. Server starts INSIDE the try so a throw there + // can't leak the dir. 
+ const dir = await mkdtemp(join(ctx.cwd, ".tsforge-script-")); + let server: IScriptServer | undefined; + + try { + server = startRpcServer(ctx, deps); + await writeFile( + join(dir, "tsforge-tools.ts"), + generateToolStubs([...SCRIPT_EXPOSABLE_TOOLS]) + ); + + const scriptPath = join(dir, "script.ts"); + + await writeFile(scriptPath, code); + + const { exitCode, output } = await runScript( + scriptPath, + ctx.cwd, + { + ...process.env, + TSFORGE_RPC_URL: server.url, + TSFORGE_RPC_TOKEN: server.token, + }, + timeoutMs(args), + ctx.signal + ); + + const { text } = condenseToolOutput( + { command: "script", output, exitCode }, + LOOP_LIMITS.maxToolOutputChars + ); + + ctx.report({ + kind: "run", + task: ctx.task, + message: "$ script", + command: "script", + exitCode, + output: text, + }); + + return `script exit ${String(exitCode)} (${String(server.callCount())} tool call${server.callCount() === 1 ? "" : "s"})\n${text}`; + } finally { + server?.stop(); + await rm(dir, { recursive: true, force: true }); + } +} diff --git a/packages/core/src/loop/turn.ts b/packages/core/src/loop/turn.ts index 700ac3d..4c13b25 100644 --- a/packages/core/src/loop/turn.ts +++ b/packages/core/src/loop/turn.ts @@ -35,6 +35,7 @@ import { WEB_BROWSE_TOOL, PACKAGE_INFO_TOOL, PACKAGE_DOCS_TOOL, + SCRIPT_TOOL, GIT_CONTEXT_TOOL, } from "../agent"; import { TsService } from "../lsp"; @@ -83,6 +84,7 @@ type AdvertisedTool = | typeof WEB_BROWSE_TOOL | typeof PACKAGE_INFO_TOOL | typeof PACKAGE_DOCS_TOOL + | typeof SCRIPT_TOOL | typeof GIT_CONTEXT_TOOL; /** Free, local web tools (fetch + search) — advertised only under TSFORGE_WEB so @@ -107,16 +109,31 @@ function gitTools(hasExistingCode: boolean): AdvertisedTool[] { return hasExistingCode && !flags.noGitTool() ? [GIT_CONTEXT_TOOL] : []; } +/** Programmatic Tool Calling — ON by default (withheld under TSFORGE_NO_SCRIPT). 
+ * Available on both scratch and existing-code runs; the plan-mode path rejects it + * at dispatch (it's a mutating tool), so a script can't write while planning. */ +function scriptTools(): AdvertisedTool[] { + return flags.scriptTool() ? [SCRIPT_TOOL] : []; +} + export function toolsFor(hasExistingCode: boolean): AdvertisedTool[] { const web = webTools(); const git = gitTools(hasExistingCode); + const script = scriptTools(); if (flags.noLspTools() || !hasExistingCode) { - return [...BASE_TOOLS, ...HASHLINE_TOOLS, ...web, ...git]; + return [...BASE_TOOLS, ...HASHLINE_TOOLS, ...web, ...git, ...script]; } - // existing-code: base + LSP nav + (gated) web + (gated) git. - return [...BASE_TOOLS, ...HASHLINE_TOOLS, ...LSP_TOOLS, ...web, ...git]; + // existing-code: base + LSP nav + (gated) web + (gated) git + (gated) script. + return [ + ...BASE_TOOLS, + ...HASHLINE_TOOLS, + ...LSP_TOOLS, + ...web, + ...git, + ...script, + ]; } /** The model wrote prose but issued NO tool call while the gate is still red — @@ -291,8 +308,12 @@ export async function runToolCalls( // paths resolved). Scope-checking the raw tool arg here instead would miss a // write the handler normalized into scope, skipping the gate. The event fires // only on a successful write, so failures/rejects never count. See P1/P2. - // (Object ref, not a captured `let`: CFA de-narrows a property after a call.) - const wrote = { value: false, path: "" }; + // EVERY in-scope file written during this tool call — a Set, not a single + // path, because ONE call can write MANY files: the `script` tool runs a + // program whose edit/create stubs each report a write through this same + // callback. Tracking only the last path would skip the write-guard + touched + // (and thus change-scoped rules like test-sibling-required) for the rest. + const wrote = new Set(); // Files mutated by a tool the model did NOT hand-write (semantic ops, // scaffolds). These re-gate and join the change scope but skip the write-guard. 
const mutated: string[] = []; @@ -303,8 +324,7 @@ export async function runToolCalls( event.file !== undefined && isInScope(event.file, ctx.task.files) ) { - wrote.value = true; - wrote.path = event.file; + wrote.add(event.file); } if (event.mutated !== undefined) { @@ -322,13 +342,18 @@ export async function runToolCalls( let feedback = ""; - if (wrote.value) { + if (wrote.size > 0) { touchedEditable = true; - state.edits += 1; - // Record what the agent wrote so change-scoped gate rules (test-sibling- - // required) know which files to enforce on, then write-guard just this file. - recordTouched(ctx, [wrote.path]); - feedback = await runWriteGuard(ctx, wrote.path); + state.edits += wrote.size; + const written = [...wrote]; + + // Record EVERY file written so change-scoped gate rules (test-sibling- + // required) enforce on all of them, then write-guard each. + recordTouched(ctx, written); + + for (const path of written) { + feedback += await runWriteGuard(ctx, path); + } } // A tool that mutated files without the model hand-writing them (a successful diff --git a/packages/core/src/policy/classify.ts b/packages/core/src/policy/classify.ts index 457362f..e17e6be 100644 --- a/packages/core/src/policy/classify.ts +++ b/packages/core/src/policy/classify.ts @@ -28,6 +28,9 @@ const KIND_BY_TOOL: Readonly> = { [TOOL_NAME.scaffoldWeb]: "write_file", [TOOL_NAME.run]: "shell", [TOOL_NAME.addDependency]: "shell", + // `script` runs a program that can call other tools — classify as shell so the + // policy treats it like `run` (its stub calls are each re-classified on dispatch). 
+ [TOOL_NAME.script]: "shell", [TOOL_NAME.packageInfo]: "network", [TOOL_NAME.packageDocs]: "network", [TOOL_NAME.webFetch]: "network", diff --git a/packages/core/tests/script-tool.test.ts b/packages/core/tests/script-tool.test.ts new file mode 100644 index 0000000..c11a448 --- /dev/null +++ b/packages/core/tests/script-tool.test.ts @@ -0,0 +1,338 @@ +import { test, expect, afterAll } from "bun:test"; +import { mkdir, mkdtemp, rm, writeFile } from "node:fs/promises"; +import { mkdtempSync } from "node:fs"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; +import { + doScript, + generateToolStubs, + type ExecuteFn, +} from "../src/loop/tools/script-tool"; +import { executeTool } from "../src/loop/tools/execute-tool"; +import { READ_ONLY_TOOL_NAMES, SCRIPT_EXPOSABLE_TOOLS } from "../src/agent"; +import type { IToolContext } from "../src/loop/tools/tool-context"; +import type { ILoopEvent } from "../src/loop/loop.types"; + +// The script tool creates its temp dir inside ctx.cwd (so workspace module +// resolution works) — point the default cwd at an isolated temp dir, never the +// repo, so no `.tsforge-script-*` artifacts land here. +const TMP_CWD = mkdtempSync(join(tmpdir(), "tsforge-script-cwd-")); + +afterAll(async () => { + await rm(TMP_CWD, { recursive: true, force: true }); +}); + +interface ICtxOpts { + cwd?: string; + files?: string[]; + readOnly?: boolean; +} + +function makeCtx(opts: ICtxOpts, events: ILoopEvent[]): IToolContext { + return { + cwd: opts.cwd ?? TMP_CWD, + files: opts.files ?? [], + task: "t", + report: (e) => events.push(e), + ...(opts.readOnly === undefined ? {} : { readOnly: opts.readOnly }), + }; +} + +/** A fake tool dispatcher: records each call and returns a canned string. 
*/ +function recordingExecute(calls: string[]): ExecuteFn { + return async (call) => { + calls.push(call.name); + + return `R:${call.name}`; + }; +} + +test("the script-exposable subset is the safe/useful tools, never script itself", () => { + // Useful read + mutation + research tools are reachable from a script… + for (const name of [ + "read", + "run", + "search", + "edit", + "create", + "web_search", + ]) { + expect(SCRIPT_EXPOSABLE_TOOLS.has(name)).toBe(true); + } + + // …but the heavy/interactive + recursion-prone ones are NOT. + for (const name of [ + "script", + "scaffold_web", + "add_dependency", + "yield_status", + ]) { + expect(SCRIPT_EXPOSABLE_TOOLS.has(name)).toBe(false); + } + + // Plan-mode (read-only) set stays mutation-free and excludes script. + expect(READ_ONLY_TOOL_NAMES.has("read")).toBe(true); + expect(READ_ONLY_TOOL_NAMES.has("web_fetch")).toBe(true); + expect(READ_ONLY_TOOL_NAMES.has("edit")).toBe(false); + expect(READ_ONLY_TOOL_NAMES.has("run")).toBe(false); + expect(READ_ONLY_TOOL_NAMES.has("script")).toBe(false); +}); + +test("generateToolStubs emits one async fn per tool plus the __call helper", () => { + const src = generateToolStubs(["read", "web_search"]); + + expect(src).toContain("export async function read(args = {})"); + expect(src).toContain("export async function web_search(args = {})"); + expect(src).toContain("async function __call(tool, args)"); + expect(src).toContain("x-tsforge-token"); + // No stub for `script` is ever generated (no recursion entry point). 
+ expect(src).not.toContain("function script("); +}); + +test("a script collapses N tool calls into ONE turn and returns only stdout", async () => { + const calls: string[] = []; + const events: ILoopEvent[] = []; + const code = [ + 'import { read } from "./tsforge-tools";', + 'const a = await read({ file: "x" });', + 'const b = await read({ file: "y" });', + 'console.log("GOT", a, b);', + ].join("\n"); + + const out = await doScript({ code }, makeCtx({}, events), { + execute: recordingExecute(calls), + }); + + expect(calls).toEqual(["read", "read"]); + expect(out).toContain("2 tool calls"); + expect(out).toContain("GOT R:read R:read"); + // Each stub call is surfaced on the ledger for observability. + expect(events.filter((e) => e.message === "↳ script:read")).toHaveLength(2); +}); + +test("a script resolves the workspace's node_modules (temp dir lives in cwd)", async () => { + // The temp dir is created INSIDE ctx.cwd so module resolution walks up to the + // project's node_modules — a script can import a workspace dep, not just stubs. 
+ const dir = await mkdtemp(join(tmpdir(), "tsforge-script-nm-")); + + try { + await mkdir(join(dir, "node_modules", "leftpad"), { recursive: true }); + await writeFile( + join(dir, "node_modules", "leftpad", "package.json"), + JSON.stringify({ name: "leftpad", version: "1.0.0", main: "index.js" }) + ); + await writeFile( + join(dir, "node_modules", "leftpad", "index.js"), + 'module.exports = { tag: () => "LEFTPAD_OK" };\n' + ); + + const events: ILoopEvent[] = []; + const code = ['import { tag } from "leftpad";', "console.log(tag());"].join( + "\n" + ); + + const out = await doScript({ code }, makeCtx({ cwd: dir }, events), { + execute: executeTool, + }); + + expect(out).toContain("LEFTPAD_OK"); + } finally { + await rm(dir, { recursive: true, force: true }); + } +}); + +test("an in-scope create through the stub lands through executeTool + reports", async () => { + const dir = await mkdtemp(join(tmpdir(), "tsforge-script-")); + + try { + const events: ILoopEvent[] = []; + const code = [ + 'import { create } from "./tsforge-tools";', + 'const r = await create({ file: "out.ts", content: "export const x = 1;\\n" });', + "console.log(r);", + ].join("\n"); + + const out = await doScript( + { code }, + makeCtx({ cwd: dir, files: ["out.ts"] }, events), + { + execute: executeTool, + } + ); + + expect(out).toContain("created out.ts"); + expect(await Bun.file(join(dir, "out.ts")).exists()).toBe(true); + expect(events.some((e) => e.kind === "create" && e.file === "out.ts")).toBe( + true + ); + } finally { + await rm(dir, { recursive: true, force: true }); + } +}); + +test("an out-of-scope create through the stub is rejected (scope inherited)", async () => { + const dir = await mkdtemp(join(tmpdir(), "tsforge-script-")); + + try { + const events: ILoopEvent[] = []; + const code = [ + 'import { create } from "./tsforge-tools";', + 'const r = await create({ file: "other.ts", content: "export const y = 2;\\n" });', + "console.log(r);", + ].join("\n"); + + const out = await 
doScript( + { code }, + makeCtx({ cwd: dir, files: ["allowed.ts"] }, events), + { execute: executeTool } + ); + + expect(out).toContain("REJECTED"); + expect(await Bun.file(join(dir, "other.ts")).exists()).toBe(false); + } finally { + await rm(dir, { recursive: true, force: true }); + } +}); + +test("plan mode rejects `script` at dispatch — no subprocess runs", async () => { + const calls: string[] = []; + const events: ILoopEvent[] = []; + + const out = await executeTool( + { name: "script", arguments: { code: 'console.log("SHOULD_NOT_RUN");' } }, + makeCtx({ readOnly: true }, events) + ); + + expect(out).toContain("plan mode"); + expect(out).not.toContain("SHOULD_NOT_RUN"); + expect(calls).toHaveLength(0); +}); + +test("the per-script tool-call cap is enforced", async () => { + const prev = process.env.TSFORGE_SCRIPT_MAX_CALLS; + + process.env.TSFORGE_SCRIPT_MAX_CALLS = "2"; + + try { + const events: ILoopEvent[] = []; + const code = [ + 'import { read } from "./tsforge-tools";', + "await read({});", + "await read({});", + "try {", + " await read({});", + "} catch (e) {", + ' console.log("CAUGHT", e.message);', + "}", + ].join("\n"); + + const out = await doScript({ code }, makeCtx({}, events), { + execute: recordingExecute([]), + }); + + expect(out).toContain("CAUGHT"); + expect(out).toContain("tool-call limit (2) exceeded"); + } finally { + if (prev === undefined) { + delete process.env.TSFORGE_SCRIPT_MAX_CALLS; + } else { + process.env.TSFORGE_SCRIPT_MAX_CALLS = prev; + } + } +}); + +test("a runaway script is killed at the wall-clock timeout", async () => { + const events: ILoopEvent[] = []; + const out = await doScript( + { code: "while (true) {}", timeoutMs: 400 }, + makeCtx({}, events), + { execute: recordingExecute([]) } + ); + + expect(out).toContain("killed: exceeded"); + expect(out).not.toContain("script exit 0"); +}); + +test("the RPC server rejects a request with a wrong token", async () => { + const events: ILoopEvent[] = []; + const code = [ + 
"const res = await fetch(process.env.TSFORGE_RPC_URL, {", + ' method: "POST",', + ' headers: { "content-type": "application/json", "x-tsforge-token": "WRONG" },', + ' body: JSON.stringify({ tool: "read", args: {} }),', + "});", + 'console.log("STATUS", res.status);', + ].join("\n"); + + const calls: string[] = []; + const out = await doScript({ code }, makeCtx({}, events), { + execute: recordingExecute(calls), + }); + + expect(out).toContain("STATUS 403"); + expect(calls).toHaveLength(0); +}); + +test("the RPC server refuses `script` (no recursion) and non-exposable tools", async () => { + const events: ILoopEvent[] = []; + const code = [ + "async function call(tool) {", + " const res = await fetch(process.env.TSFORGE_RPC_URL, {", + ' method: "POST",', + ' headers: { "content-type": "application/json", "x-tsforge-token": process.env.TSFORGE_RPC_TOKEN },', + " body: JSON.stringify({ tool, args: {} }),", + " });", + " return (await res.json()).error;", + "}", + 'console.log("SCRIPT:", await call("script"));', + 'console.log("SCAFFOLD:", await call("scaffold_web"));', + ].join("\n"); + + const calls: string[] = []; + const out = await doScript({ code }, makeCtx({}, events), { + execute: recordingExecute(calls), + }); + + expect(out).toContain("SCRIPT: tool `script` is not callable from a script"); + expect(out).toContain( + "SCAFFOLD: tool `scaffold_web` is not callable from a script" + ); + expect(calls).toHaveLength(0); +}); + +test("concurrent stub calls are serialized (no interleaved dispatch)", async () => { + const order: string[] = []; + let seq = 0; + + const execute: ExecuteFn = async () => { + const id = (seq += 1); + + order.push(`start:${String(id)}`); + await new Promise((r) => setTimeout(r, 15)); + order.push(`end:${String(id)}`); + + return `r${String(id)}`; + }; + + const events: ILoopEvent[] = []; + const code = [ + 'import { read } from "./tsforge-tools";', + "await Promise.all([read({}), read({})]);", + 'console.log("done");', + ].join("\n"); + + 
await doScript({ code }, makeCtx({}, events), { execute }); + + expect(order).toEqual(["start:1", "end:1", "start:2", "end:2"]); +}); + +test("empty `code` is rejected without spawning anything", async () => { + const calls: string[] = []; + const events: ILoopEvent[] = []; + const out = await doScript({ code: " " }, makeCtx({}, events), { + execute: recordingExecute(calls), + }); + + expect(out).toContain("must be a non-empty"); + expect(calls).toHaveLength(0); +}); diff --git a/packages/core/tests/tool-accounting.test.ts b/packages/core/tests/tool-accounting.test.ts index 58f15de..5612066 100644 --- a/packages/core/tests/tool-accounting.test.ts +++ b/packages/core/tests/tool-accounting.test.ts @@ -127,6 +127,46 @@ test("a successful in-scope create counts as one edit and re-gates", async () => } }); +// Critical (PR #50 review): ONE `script` call can write MANY files via its +// edit/create stubs. runToolCalls tracked a single `wrote.path` and overwrote it +// per event, so every file but the LAST skipped the write-guard AND `touched` +// (which drives change-scoped rules like test-sibling-required). All written +// files must be recorded + counted. +test("a script that writes several files records ALL of them in touched", async () => { + const dir = await mkdtemp(join(tmpdir(), "tsforge-acct-script-")); + + try { + const ctx: ILoopCtx = { ...ctxFor(dir, ["**/*"]), touched: new Set() }; + const state = freshState(); + const code = [ + 'import { create } from "./tsforge-tools";', + 'await create({ file: "a.ts", content: "export const a = 1;\\n" });', + 'await create({ file: "b.ts", content: "export const b = 2;\\n" });', + 'await create({ file: "c.ts", content: "export const c = 3;\\n" });', + 'console.log("done");', + ].join("\n"); + + const touched = await runToolCalls( + [{ name: TOOL_NAME.script, arguments: { code } }], + ctx, + state + ); + + expect(touched).toBe(true); + // All three writes counted (not just the last), and all three recorded. 
+ expect(state.edits).toBe(3); + expect([...(ctx.touched ?? [])].sort()).toEqual(["a.ts", "b.ts", "c.ts"]); + expect(await Bun.file(join(dir, "a.ts")).exists()).toBe(true); + expect(await Bun.file(join(dir, "c.ts")).exists()).toBe(true); + // The per-run temp dir is cleaned up (no `.tsforge-script-*` left behind). + const leftovers = [...new Bun.Glob(".tsforge-script-*").scanSync(dir)]; + + expect(leftovers).toHaveLength(0); + } finally { + await rm(dir, { recursive: true, force: true }); + } +}); + // P2: a same-content edit (oldString === newString, or already-applied) writes // nothing. The handler must NOT emit an edit event for it, so it neither counts // toward `state.edits` nor re-gates — otherwise a no-op edit lets a green gate @@ -453,8 +493,14 @@ const MUTATING_TOOLS = new Set([ TOOL_NAME.addDependency, ]); // run = the model's raw shell (writes are its own, not scoped harness edits); -// yield_status = turn control, never touches the workspace. -const SPECIAL_TOOLS = new Set([TOOL_NAME.run, TOOL_NAME.yieldStatus]); +// yield_status = turn control, never touches the workspace; script = runs a +// program whose tool calls (incl. edit/create) re-enter executeTool and report +// their OWN mutations, so the script call itself accounts for nothing. 
+const SPECIAL_TOOLS = new Set([ + TOOL_NAME.run, + TOOL_NAME.yieldStatus, + TOOL_NAME.script, +]); test("every registered tool is classified read-only, mutating, or special", () => { for (const name of Object.values(TOOL_NAME)) { diff --git a/packages/core/tests/tools-gating.test.ts b/packages/core/tests/tools-gating.test.ts index 5c82c27..e7ac860 100644 --- a/packages/core/tests/tools-gating.test.ts +++ b/packages/core/tests/tools-gating.test.ts @@ -8,6 +8,7 @@ afterEach(() => { delete process.env.TSFORGE_NO_LSP_TOOLS; delete process.env.TSFORGE_WEB; delete process.env.TSFORGE_NO_GIT_TOOL; + delete process.env.TSFORGE_NO_SCRIPT; }); test("scratch (no existing code) gets only the base tools — no LSP nav set", () => { @@ -21,6 +22,7 @@ test("scratch (no existing code) gets only the base tools — no LSP nav set", ( "edit_lines", "read", "run", + "script", ]); }); @@ -51,6 +53,7 @@ test("TSFORGE_NO_LSP_TOOLS=1 forces base-only — but git_context survives", () "git_context", "read", "run", + "script", ]); }); @@ -92,3 +95,15 @@ test("web tools are available on scratch tasks too when enabled", () => { expect(n).toContain("package_info"); expect(n).toContain("package_docs"); }); + +test("the script tool is on by default for scratch and existing code", () => { + expect(names(toolsFor(true))).toContain("script"); + expect(names(toolsFor(false))).toContain("script"); +}); + +test("TSFORGE_NO_SCRIPT=1 withholds the script tool (kill switch)", () => { + process.env.TSFORGE_NO_SCRIPT = "1"; + + expect(names(toolsFor(true))).not.toContain("script"); + expect(names(toolsFor(false))).not.toContain("script"); +});