growthxai · bnchrch · Jun 25, 2026 · Jun 23, 2026 · Jun 25, 2026
diff --git a/.changeset/out-491-cli-catalog-scenario-resolution.md b/.changeset/out-491-cli-catalog-scenario-resolution.md
@@ -0,0 +1,5 @@
+---
+"@outputai/cli": minor
+---
+
+CLI `start`/`run`/`test`/`dataset generate` now resolve scenarios and route execution against `--catalog`/`OUTPUT_CATALOG_ID` instead of the API server's default catalog. This removes the ~30s scenario-resolution stall in worktrees where the default catalog has no worker polling it. `workflow test` and `workflow dataset generate` also gain a `--catalog` flag (env: `OUTPUT_CATALOG_ID`), matching `list`/`start`/`run`.
diff --git a/sdk/cli/src/commands/workflow/dataset/generate.spec.ts b/sdk/cli/src/commands/workflow/dataset/generate.spec.ts
@@ -0,0 +1,84 @@
+/* eslint-disable @typescript-eslint/no-explicit-any */
+import { describe, it, expect, vi, beforeEach } from 'vitest';
+
+vi.mock( '#api/generated/api.js', () => ( { // stub the API module; the test sets postWorkflowRun's resolved value
+  postWorkflowRun: vi.fn()
+} ) );
+
+vi.mock( '#utils/scenario_resolver.js', () => ( { // stub scenario lookup; the test sets the resolution result
+  resolveScenarioPath: vi.fn(),
+  getScenarioNotFoundMessage: vi.fn().mockReturnValue( 'not found' )
+} ) );
+
+vi.mock( '#utils/input_parser.js', () => ( { // stub input parsing; the test supplies the parsed input
+  parseInputFlag: vi.fn()
+} ) );
+
+vi.mock( '#services/datasets.js', () => ( { // stub the dataset service with fixed return values
+  writeDataset: vi.fn(),
+  resolveDefaultDatasetsDir: vi.fn().mockResolvedValue( '/datasets' ),
+  buildDataset: vi.fn().mockReturnValue( { name: 'basic' } ),
+  getExecutionTime: vi.fn().mockResolvedValue( 100 ),
+  extractDatasetName: vi.fn()
+} ) );
+
+describe( 'workflow dataset generate command', () => { // covers the catalog flag definition and its use by run()
+  beforeEach( () => {
+    vi.clearAllMocks(); // drop call history recorded by earlier tests
+    delete process.env.OUTPUT_CATALOG_ID; // the catalog flag is bound to this variable, so start without it
+  } );
+
+  describe( 'command definition', () => {
+    it( 'binds the catalog flag to OUTPUT_CATALOG_ID', async () => {
+      const DatasetGenerate = ( await import( './generate.js' ) ).default; // command class under test
+      expect( DatasetGenerate.flags ).toHaveProperty( 'catalog' );
+      expect( DatasetGenerate.flags.catalog.env ).toBe( 'OUTPUT_CATALOG_ID' );
+      expect( DatasetGenerate.flags.catalog.char ).toBe( 'c' ); // short flag character
+    } );
+  } );
+
+  describe( 'run()', () => {
+    const createCommand = async ( flagOverrides: Record<string, unknown> = {} ) => { // builds a command with parse() stubbed
+      const DatasetGenerate = ( await import( './generate.js' ) ).default;
+      const { postWorkflowRun } = await import( '#api/generated/api.js' );
+      const { resolveScenarioPath } = await import( '#utils/scenario_resolver.js' );
+      const { parseInputFlag } = await import( '#utils/input_parser.js' );
+
+      const cmd = new DatasetGenerate( [ 'my_workflow' ], {} as any ); // second constructor argument is an empty stub
+      cmd.log = vi.fn();
+      cmd.error = vi.fn( () => { // stubbed to throw 'error called'
+        throw new Error( 'error called' );
+      } ) as any;
+      ( cmd as any ).parse = vi.fn().mockResolvedValue( { // parse() resolves to these fixed args and flags
+        args: { workflowName: 'my_workflow', scenario: 'basic' },
+        flags: { catalog: undefined, trace: undefined, name: undefined, download: false, limit: 5, input: undefined, ...flagOverrides }
+      } );
+
+      return { // hand back the command plus the typed mocks the test configures
+        cmd,
+        postWorkflowRun: vi.mocked( postWorkflowRun ),
+        resolveScenarioPath: vi.mocked( resolveScenarioPath ),
+        parseInputFlag: vi.mocked( parseInputFlag )
+      };
+    };
+
+    it( 'resolves the scenario and runs the workflow against the resolved catalog', async () => {
+      const { cmd, postWorkflowRun, resolveScenarioPath, parseInputFlag } = await createCommand( { catalog: 'my-catalog' } );
+      resolveScenarioPath.mockResolvedValue( { found: true, path: '/scenarios/basic.json', searchedPaths: [] } ); // scenario is found
+      parseInputFlag.mockResolvedValue( { foo: 'bar' } as any );
+      postWorkflowRun.mockResolvedValue( {
+        data: { workflowId: 'wf-1', output: { ok: true } },
+        status: 200,
+        headers: new Headers()
+      } as any );
+
+      await cmd.run();
+
+      expect( resolveScenarioPath ).toHaveBeenCalledWith( 'my_workflow', 'basic', undefined, undefined, 'my-catalog' ); // catalog is the fifth argument
+      expect( postWorkflowRun ).toHaveBeenCalledWith( // the same catalog must appear in the first argument
+        expect.objectContaining( { workflowName: 'my_workflow', catalog: 'my-catalog' } ),
+        expect.anything()
+      );
+    } );
+  } );
+} );
diff --git a/sdk/cli/src/commands/workflow/dataset/generate.ts b/sdk/cli/src/commands/workflow/dataset/generate.ts
@@ -37,6 +37,14 @@ export default class DatasetGenerate extends Command {
   };
 
   static override flags = {
+    catalog: Flags.string( {
+      char: 'c',
+      aliases: [ 'task-queue' ],
+      charAliases: [ 'q' ],
+      deprecateAliases: true,
+      description: 'Catalog name for workflow execution (defaults to OUTPUT_CATALOG_ID)',
+      env: 'OUTPUT_CATALOG_ID'
+    } ),
     trace: Flags.string( {
       char: 't',
       description: 'Path to a local trace file to extract dataset from',
@@ -84,20 +92,23 @@ export default class DatasetGenerate extends Command {
       args.workflowName,
       args.scenario,
       flags.input,
-      flags.name
+      flags.name,
+      flags.catalog
     );
   }
 
   private async generateFromScenario(
     workflowName: string,
     scenario: string | undefined,
     inputFlag: string | undefined,
-    nameOverride: string | undefined
+    nameOverride: string | undefined,
+    catalog: string | undefined
   ): Promise<void> {
     const resolvedInput = await this.resolveScenarioInput(
       workflowName,
       scenario,
-      inputFlag
+      inputFlag,
+      catalog
     );
 
     const datasetName = nameOverride ?? scenario ?? 'dataset';
@@ -106,7 +117,8 @@ export default class DatasetGenerate extends Command {
 
     const response = await postWorkflowRun( {
       workflowName,
-      input: resolvedInput
+      input: resolvedInput,
+      catalog
     }, {
       config: { timeout: 600000 }
     } );
@@ -199,7 +211,8 @@ export default class DatasetGenerate extends Command {
   private async resolveScenarioInput(
     workflowName: string,
     scenario: string | undefined,
-    inputFlag: string | undefined
+    inputFlag: string | undefined,
+    catalog: string | undefined
   ): Promise<unknown> {
     if ( inputFlag && scenario ) {
       return ux.error(
@@ -213,7 +226,7 @@ export default class DatasetGenerate extends Command {
     }
 
     if ( scenario ) {
-      const resolution = await resolveScenarioPath( workflowName, scenario );
+      const resolution = await resolveScenarioPath( workflowName, scenario, undefined, undefined, catalog );
       if ( !resolution.found ) {
         return ux.error(
           getScenarioNotFoundMessage( workflowName, scenario, resolution.searchedPaths ),

diff --git a/sdk/cli/src/commands/workflow/run.spec.ts b/sdk/cli/src/commands/workflow/run.spec.ts
@@ -77,6 +77,7 @@ describe( 'workflow run command', () => {
 
       await cmd.run();
 
+      expect( resolveInput ).toHaveBeenCalledWith( 'my_workflow', undefined, undefined, 'run', undefined );
       expect( postWorkflowRun ).toHaveBeenCalledTimes( 1 );
       expect( postWorkflowRun ).toHaveBeenCalledWith(
         { workflowName: 'my_workflow', input: { key: 'value' }, catalog: undefined },
@@ -86,6 +87,28 @@ describe( 'workflow run command', () => {
       expect( cmd.log ).toHaveBeenCalledWith( expect.stringMatching( /\n/ ) );
     } );
 
+    it( 'threads the resolved catalog to resolveInput and postWorkflowRun', async () => {
+      const { cmd, postWorkflowRun, resolveInput } = await createCommand();
+      ( cmd as any ).parse = vi.fn().mockResolvedValue( { // replace parse() so the run sees a scenario and a catalog flag
+        args: { workflowName: 'my_workflow', scenario: 'basic' },
+        flags: { input: undefined, catalog: 'my-catalog', format: 'text' }
+      } );
+      resolveInput.mockResolvedValue( { key: 'value' } );
+      postWorkflowRun.mockResolvedValue( {
+        data: { status: 'completed', result: {} },
+        status: 200,
+        headers: new Headers()
+      } as any );
+
+      await cmd.run();
+
+      expect( resolveInput ).toHaveBeenCalledWith( 'my_workflow', 'basic', undefined, 'run', 'my-catalog' ); // catalog is the fifth argument
+      expect( postWorkflowRun ).toHaveBeenCalledWith( // the same catalog must appear in the first argument
+        expect.objectContaining( { catalog: 'my-catalog' } ),
+        expect.anything()
+      );
+    } );
+
     it( 'retries when response has Retry-After and succeeds on second attempt', async () => {
       const { cmd, postWorkflowRun, resolveInput } = await createCommand();
       resolveInput.mockResolvedValue( {} );

diff --git a/sdk/cli/src/commands/workflow/run.ts b/sdk/cli/src/commands/workflow/run.ts
@@ -81,7 +81,7 @@ export default class WorkflowRun extends Command {
   async run(): Promise<WorkflowResultResponse> {
     const { args, flags } = await this.parse( WorkflowRun );
 
-    const input = await resolveInput( args.workflowName, args.scenario, flags.input, 'run' );
+    const input = await resolveInput( args.workflowName, args.scenario, flags.input, 'run', flags.catalog );
 
     this.log( `Executing workflow: ${args.workflowName}...` );
 

diff --git a/sdk/cli/src/commands/workflow/start.spec.ts b/sdk/cli/src/commands/workflow/start.spec.ts
@@ -1,13 +1,20 @@
+/* eslint-disable @typescript-eslint/no-explicit-any */
 import { describe, it, expect, vi, beforeEach } from 'vitest';
 
-vi.mock( '../../api/generated/api.js', () => ( {
+vi.mock( '#api/generated/api.js', () => ( {
   postWorkflowStart: vi.fn()
 } ) );
 
+vi.mock( '#utils/resolve_input.js', () => ( { // stub input resolution; beforeEach gives it a default resolved value
+  resolveInput: vi.fn()
+} ) );
+
 describe( 'workflow start command', () => {
-  beforeEach( () => {
+  beforeEach( async () => {
     vi.clearAllMocks();
     delete process.env.OUTPUT_CATALOG_ID;
+    const { resolveInput } = await import( '#utils/resolve_input.js' );
+    vi.mocked( resolveInput ).mockResolvedValue( {} );
   } );
 
   describe( 'command definition', () => {
@@ -30,5 +37,62 @@ describe( 'workflow start command', () => {
       expect( WorkflowStart.args ).toHaveProperty( 'scenario' );
       expect( WorkflowStart.args.scenario.required ).toBe( false );
     } );
+
+    it( 'binds the catalog flag to OUTPUT_CATALOG_ID', async () => {
+      const WorkflowStart = ( await import( './start.js' ) ).default; // command class under test
+      expect( WorkflowStart.flags.catalog.env ).toBe( 'OUTPUT_CATALOG_ID' );
+      expect( WorkflowStart.flags.catalog.char ).toBe( 'c' ); // short flag character
+    } );
+  } );
+
+  describe( 'run()', () => { // checks that the catalog flag reaches resolveInput and postWorkflowStart
+    const createCommand = async ( flagOverrides: Record<string, unknown> = {} ) => { // builds a command with parse() stubbed
+      const WorkflowStart = ( await import( './start.js' ) ).default;
+      const { postWorkflowStart } = await import( '#api/generated/api.js' );
+      const { resolveInput } = await import( '#utils/resolve_input.js' );
+
+      const cmd = new WorkflowStart( [ 'my_workflow' ], {} as any ); // second constructor argument is an empty stub
+      cmd.log = vi.fn();
+      cmd.error = vi.fn( () => { // stubbed to throw 'error called'
+        throw new Error( 'error called' );
+      } ) as any;
+      ( cmd as any ).parse = vi.fn().mockResolvedValue( { // parse() resolves to these args; flagOverrides win over the defaults
+        args: { workflowName: 'my_workflow', scenario: undefined },
+        flags: { input: undefined, catalog: undefined, ...flagOverrides }
+      } );
+
+      return { cmd, postWorkflowStart: vi.mocked( postWorkflowStart ), resolveInput: vi.mocked( resolveInput ) };
+    };
+
+    it( 'threads the resolved catalog to resolveInput and postWorkflowStart', async () => {
+      const { cmd, postWorkflowStart, resolveInput } = await createCommand( { catalog: 'my-catalog' } );
+      resolveInput.mockResolvedValue( { key: 'value' } );
+      postWorkflowStart.mockResolvedValue( {
+        data: { workflowId: 'wf-123' },
+        status: 200,
+        headers: new Headers()
+      } as any );
+
+      await cmd.run();
+
+      expect( resolveInput ).toHaveBeenCalledWith( 'my_workflow', undefined, undefined, 'start', 'my-catalog' ); // catalog is the fifth argument
+      expect( postWorkflowStart ).toHaveBeenCalledWith( // the same catalog must appear in the call argument
+        expect.objectContaining( { workflowName: 'my_workflow', catalog: 'my-catalog' } )
+      );
+    } );
+
+    it( 'passes undefined catalog through when none is set', async () => {
+      const { cmd, postWorkflowStart, resolveInput } = await createCommand(); // no overrides, so flags.catalog stays undefined
+      resolveInput.mockResolvedValue( {} );
+      postWorkflowStart.mockResolvedValue( {
+        data: { workflowId: 'wf-123' },
+        status: 200,
+        headers: new Headers()
+      } as any );
+
+      await cmd.run();
+
+      expect( resolveInput ).toHaveBeenCalledWith( 'my_workflow', undefined, undefined, 'start', undefined ); // only resolveInput is asserted here
+    } );
+  } );
 } );
diff --git a/sdk/cli/src/commands/workflow/start.ts b/sdk/cli/src/commands/workflow/start.ts
@@ -43,7 +43,7 @@ export default class WorkflowStart extends Command {
   async run(): Promise<void> {
     const { args, flags } = await this.parse( WorkflowStart );
 
-    const input = await resolveInput( args.workflowName, args.scenario, flags.input, 'start' );
+    const input = await resolveInput( args.workflowName, args.scenario, flags.input, 'start', flags.catalog );
 
     this.log( `Starting workflow: ${args.workflowName}...` );
 

diff --git a/sdk/cli/src/commands/workflow/test_eval.spec.ts b/sdk/cli/src/commands/workflow/test_eval.spec.ts
@@ -7,11 +7,19 @@ vi.mock( '#api/generated/api.js', () => ( {
   postWorkflowRun: vi.fn()
 } ) );
 
+vi.mock( '#api/workflow_catalog.js', () => ( { // stub the catalog fetch; beforeEach sets the workflows it resolves to
+  fetchWorkflowCatalog: vi.fn()
+} ) );
+
 vi.mock( '#services/datasets.js', () => ( {
   readAllDatasets: vi.fn(),
   writeDataset: vi.fn()
 } ) );
 
+vi.mock( '#utils/eval_diagnostics.js', () => ( { // stub the diagnostic helper with a fixed resolved message
+  diagnoseMissingEvalWorkflow: vi.fn().mockResolvedValue( 'missing eval workflow' )
+} ) );
+
 const passingOutput: EvalOutput = {
   cases: [ { datasetName: 'd1', verdict: 'pass', evaluators: [] } ],
   summary: { total: 1, passed: 1, partial: 0, failed: 0, acceptableRate: 1 }
@@ -31,10 +39,16 @@ describe( 'workflow test command', () => {
     process.exitCode = undefined;
 
     const { readAllDatasets } = await import( '#services/datasets.js' );
+    const { fetchWorkflowCatalog } = await import( '#api/workflow_catalog.js' );
     vi.mocked( readAllDatasets ).mockResolvedValue( {
       datasets: [ { name: 'd1', input: {}, last_output: { output: {}, date: '2026-01-01' } } as any ],
       dir: '/tmp/datasets'
     } );
+    // Catalog includes both eval names so ensureEvalWorkflowRegistered passes deterministically.
+    vi.mocked( fetchWorkflowCatalog ).mockResolvedValue( [
+      { name: getEvalWorkflowName( 'simple' ) },
+      { name: getEvalWorkflowName( 'my_workflow' ) }
+    ] as any );
   } );
 
   afterEach( () => {
@@ -46,6 +60,13 @@ describe( 'workflow test command', () => {
       const WorkflowTest = ( await import( './test_eval.js' ) ).default;
       expect( WorkflowTest.enableJsonFlag ).toBe( true );
     } );
+
+    it( 'binds the catalog flag to OUTPUT_CATALOG_ID', async () => {
+      const WorkflowTest = ( await import( './test_eval.js' ) ).default; // command class under test
+      expect( WorkflowTest.flags ).toHaveProperty( 'catalog' );
+      expect( WorkflowTest.flags.catalog.env ).toBe( 'OUTPUT_CATALOG_ID' );
+      expect( WorkflowTest.flags.catalog.char ).toBe( 'c' ); // short flag character
+    } );
   } );
 
   describe( 'run()', () => {
@@ -105,5 +126,37 @@ describe( 'workflow test command', () => {
       expect( result ).toEqual( failingOutput );
       expect( process.exitCode ).toBe( 1 );
     } );
+
+    it( 'routes registration, dataset runs, and the eval run to the resolved catalog', async () => {
+      const WorkflowTest = ( await import( './test_eval.js' ) ).default;
+      const { postWorkflowRun } = await import( '#api/generated/api.js' );
+      const { fetchWorkflowCatalog } = await import( '#api/workflow_catalog.js' );
+
+      const cmd = new WorkflowTest( [ 'my_workflow' ], {} as any ); // second constructor argument is an empty stub
+      cmd.log = vi.fn();
+      ( cmd as any ).jsonEnabled = vi.fn().mockReturnValue( false ); // jsonEnabled() is stubbed to return false
+      ( cmd as any ).parse = vi.fn().mockResolvedValue( { // parse() resolves to these fixed args and flags
+        args: { workflowName: 'my_workflow' },
+        flags: { catalog: 'my-catalog', cached: false, save: false, dataset: undefined }
+      } );
+
+      vi.mocked( postWorkflowRun ) // queued results: first answers call 1, second answers call 2
+        .mockResolvedValueOnce( { data: { output: {} }, status: 200, headers: new Headers() } as any )
+        .mockResolvedValueOnce( { data: { output: passingOutput }, status: 200, headers: new Headers() } as any );
+
+      await cmd.run();
+
+      expect( vi.mocked( fetchWorkflowCatalog ) ).toHaveBeenCalledWith( 'my-catalog' ); // catalog lookup uses the flag value
+      expect( postWorkflowRun ).toHaveBeenNthCalledWith( // call 1 targets my_workflow itself
+        1,
+        expect.objectContaining( { workflowName: 'my_workflow', catalog: 'my-catalog' } ),
+        expect.anything()
+      );
+      expect( postWorkflowRun ).toHaveBeenNthCalledWith( // call 2 targets the eval workflow derived from my_workflow
+        2,
+        expect.objectContaining( { workflowName: getEvalWorkflowName( 'my_workflow' ), catalog: 'my-catalog' } ),
+        expect.anything()
+      );
+    } );
   } );
 } );