ktx/packages/cli/test/context/llm/codex-exec-events.test.ts

import { describe, expect, it } from 'vitest';
import {
  parseCodexExecEventLine,
  summarizeCodexExecEvents,
} from '../../../src/context/llm/codex-exec-events.js';

describe('Codex exec event parsing', () => {
  it('uses the completed turn as one step when no MCP tools run', () => {
    // Fixed clock: the run is stamped as starting at 100 and every reading returns 125.
    const clock = { startedAt: 100, now: () => 125 };

    // Summary this test expects for a turn that contains only an agent message.
    const expected = {
      finalText: 'hello from codex',
      stopReason: 'natural',
      usage: { inputTokens: 12, outputTokens: 5, totalTokens: 17 },
      stepCount: 1,
      stepBoundariesMs: [25],
      toolCallCount: 0,
      toolFailures: [],
    };

    const result = summarizeCodexExecEvents(
      [
        { type: 'thread.started', thread_id: 'thr_1' },
        { type: 'turn.started' },
        {
          type: 'item.completed',
          item: {
            id: 'item_1',
            type: 'agent_message',
            text: 'hello from codex',
          },
        },
        {
          type: 'turn.completed',
          usage: {
            input_tokens: 12,
            cached_input_tokens: 4,
            output_tokens: 5,
            reasoning_output_tokens: 2,
          },
        },
      ],
      clock,
    );

    expect(result).toEqual(expected);
  });

  it('uses completed MCP tool calls as loop steps', () => {
    // Scripted clock: hands out each reading once, in order, then keeps returning 175.
    const readings = [115, 140, 175];
    let cursor = 0;
    const clock = {
      startedAt: 100,
      now: () => readings[cursor++] ?? 175,
    };

    // Two MCP calls are expected to yield two steps; only the failed `lookup` is reported.
    const expected = {
      finalText: 'done',
      stopReason: 'natural',
      usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 },
      stepCount: 2,
      stepBoundariesMs: [15, 40],
      toolCallCount: 2,
      toolFailures: ['lookup: denied'],
    };

    const result = summarizeCodexExecEvents(
      [
        { type: 'turn.started' },
        {
          type: 'item.started',
          item: {
            id: 'call_1',
            type: 'mcp_tool_call',
            server: 'ktx',
            tool: 'search',
            arguments: {},
            status: 'in_progress',
          },
        },
        {
          type: 'item.completed',
          item: {
            id: 'call_1',
            type: 'mcp_tool_call',
            server: 'ktx',
            tool: 'search',
            arguments: {},
            status: 'completed',
          },
        },
        {
          type: 'item.started',
          item: {
            id: 'call_2',
            type: 'mcp_tool_call',
            server: 'ktx',
            tool: 'lookup',
            arguments: {},
            status: 'in_progress',
          },
        },
        {
          type: 'item.completed',
          item: {
            id: 'call_2',
            type: 'mcp_tool_call',
            server: 'ktx',
            tool: 'lookup',
            arguments: {},
            status: 'failed',
            error: { message: 'denied' },
          },
        },
        {
          type: 'item.completed',
          item: { id: 'item_1', type: 'agent_message', text: 'done' },
        },
        {
          type: 'turn.completed',
          usage: {
            input_tokens: 1,
            output_tokens: 1,
            cached_input_tokens: 0,
            reasoning_output_tokens: 0,
          },
        },
      ],
      clock,
    );

    expect(result).toEqual(expected);
  });

  // Regression case: a tool call that completes with `error: null` must count as a
  // successful call rather than showing up in `toolFailures`.
  it('does not treat a completed MCP tool call as failed when Codex sends error: null', () => {
    // Captured verbatim from a real @openai/codex-sdk run: successful tool calls
    // carry `error: null` and `result` alongside `status: "completed"`.
    // Keep the fixture exactly as captured; its shape is the point of the test.
    const summary = summarizeCodexExecEvents([
      { type: 'turn.started' },
      {
        // Call announced while still in progress: both `result` and `error` are null.
        type: 'item.started',
        item: {
          id: 'item_1',
          type: 'mcp_tool_call',
          server: 'ktx',
          tool: 'echo_value',
          arguments: { value: 'ktx_codex_tool_ok' },
          result: null,
          error: null,
          status: 'in_progress',
        },
      },
      {
        // Same call finished: `result` is populated, `error` is still an explicit null.
        type: 'item.completed',
        item: {
          id: 'item_1',
          type: 'mcp_tool_call',
          server: 'ktx',
          tool: 'echo_value',
          arguments: { value: 'ktx_codex_tool_ok' },
          result: { content: [{ type: 'text', text: 'echo:ktx_codex_tool_ok' }], structured_content: null },
          error: null,
          status: 'completed',
        },
      },
      { type: 'item.completed', item: { id: 'm1', type: 'agent_message', text: 'done' } },
      { type: 'turn.completed', usage: { input_tokens: 1, output_tokens: 1 } },
    ]);

    // No failure may be recorded for the null-error call...
    expect(summary.toolFailures).toEqual([]);
    // ...and the call must still be counted exactly once.
    expect(summary.toolCallCount).toBe(1);
  });

  it('counts built-in command executions as loop steps without failing the loop', () => {
    // Scripted clock: hands out each reading once, in order, then keeps returning 130.
    const readings = [110, 130];
    let cursor = 0;
    const clock = {
      startedAt: 100,
      now: () => readings[cursor++] ?? 130,
    };

    const { stepCount, stepBoundariesMs, toolFailures, toolCallCount } = summarizeCodexExecEvents(
      [
        { type: 'turn.started' },
        {
          type: 'item.completed',
          item: {
            id: 'c1',
            type: 'command_execution',
            command: 'ls',
            status: 'completed',
            exit_code: 0,
          },
        },
        {
          type: 'item.completed',
          item: {
            id: 'c2',
            type: 'command_execution',
            command: 'cat missing',
            status: 'failed',
            exit_code: 1,
          },
        },
        {
          type: 'item.completed',
          item: { id: 'm1', type: 'agent_message', text: 'done' },
        },
        {
          type: 'turn.completed',
          usage: { input_tokens: 2, output_tokens: 1 },
        },
      ],
      clock,
    );

    // Both command executions are expected to register as steps with their own boundaries.
    expect(stepCount).toBe(2);
    expect(stepBoundariesMs).toEqual([10, 30]);
    // A non-zero command exit is normal agent exploration, not a runtime tool failure.
    expect(toolFailures).toEqual([]);
    // Command executions are not expected to count toward the tool-call total.
    expect(toolCallCount).toBe(0);
  });

  it('maps turn failures into error stop reason', () => {
    // Message carried by the failed turn; the summary's error is expected to include it.
    const failureMessage = 'Codex could not connect to required MCP server';

    const { stopReason, error } = summarizeCodexExecEvents([
      { type: 'turn.started' },
      {
        type: 'turn.failed',
        error: { message: failureMessage },
      },
    ]);

    expect(stopReason).toBe('error');
    expect(error?.message).toContain(failureMessage);
  });

  it('unwraps the Codex API error envelope into its human-readable message', () => {
    // The human-readable text that is expected to come out of the summary unchanged.
    const readableMessage =
      "The 'gpt-5.3-codex' model is not supported when using Codex with a ChatGPT account.";

    // Codex serializes API errors as a JSON envelope inside the event message.
    const envelope = JSON.stringify({
      type: 'error',
      status: 400,
      error: {
        type: 'invalid_request_error',
        message: readableMessage,
      },
    });

    // The same envelope appears in both the `error` event and the failed turn.
    const { stopReason, error } = summarizeCodexExecEvents([
      { type: 'thread.started', thread_id: 'thr_1' },
      { type: 'turn.started' },
      { type: 'error', message: envelope },
      { type: 'turn.failed', error: { message: envelope } },
    ]);

    expect(stopReason).toBe('error');
    expect(error?.message).toBe(readableMessage);
  });

  it('maps max-turns terminal reasons into budget stop reason when Codex emits one', () => {
    // A completed turn tagged with reason `max_turns` is expected to summarize as `budget`.
    const { stopReason } = summarizeCodexExecEvents([
      { type: 'turn.started' },
      {
        type: 'turn.completed',
        reason: 'max_turns',
        usage: { input_tokens: 1, output_tokens: 1 },
      },
    ]);

    expect(stopReason).toBe('budget');
  });

  it('throws a clear error for malformed JSONL lines', () => {
    // Deferred call so `expect` can observe the throw from a truncated JSON line.
    const parseBrokenLine = () => parseCodexExecEventLine('{not-json');

    expect(parseBrokenLine).toThrow('Codex JSONL event stream was malformed');
  });
});
feat: add codex llm backend for ktx runtime work (#253) * feat: add codex sdk runner foundation * feat: parse codex runtime events * feat: expose codex runtime mcp tools * feat: add codex llm runtime * feat: wire codex llm backend * test: avoid Array.fromAsync in codex runner test * docs: document codex llm backend * fix: tighten codex runtime config ownership * fix: use codex sdk env and thread options * fix: parse codex sdk event shapes * test: add codex backend live smoke * docs: clarify codex backend isolation * fix: drive codex loop metrics from mcp events * fix: enforce codex local step budget * docs: disclose codex isolation limits * fix: count all codex agent steps and stream step callbacks live The agent-loop step budget only counted completed mcp_tool_call items, so built-in command_execution steps (which the public Codex SDK/CLI surface can still expose) never decremented the budget, letting ingest/reconciliation run past stepBudget until Codex stopped on its own. onStepFinish was also replayed only after the whole stream drained, so live work_unit_step / reconciliation progress appeared stuck until the Codex process exited. collectEvents is now the single live step accumulator: it counts every completed agent-action item via a shared isCompletedAgentStep predicate (command_execution, mcp_tool_call, file_change, web_search), fires onStepFinish as each step completes, and enforces the budget on that broader count. A no-tool turn still counts as one step. toolFailures stays MCP-specific, since a non-zero command exit is normal agent exploration, not a loop failure. * test: align ingest llm-guard assertions with codex backend The skip-llm ingest guard message now lists codex as a valid backend and mentions a Claude Code/Codex session plus a codex setup hint, but this slow suite test still asserted the pre-codex wording. Update it to match the production message (already covered by the local-bundle-runtime unit test) and add the codex setup-line assertion. 
* fix: treat codex error:null tool calls as success The Codex SDK serializes error: null on successful mcp_tool_call items, so the failure check (item.error !== undefined) flagged every successful tool call as failed with the empty-payload default "Codex turn failed". This killed every ingest work unit under the codex backend before it could produce a patch. Key on status === 'failed' (authoritative, always set) and only treat a populated error object as a failure. Add a regression test built from a verbatim real-SDK event capture. * fix: default codex backend to gpt-5.5 and report real probe errors The previous default gpt-5.3-codex is an API-key-only model that the OpenAI API rejects under ChatGPT-account (subscription) auth, so codex status/setup failed with a misleading "authentication is not usable" message even though auth was fine. - Default codex model is now gpt-5.5 (works on both subscription and API-key auth); the curated setup picker offers gpt-5.5 / gpt-5.4 / gpt-5.4-mini and keeps free-form entry for account-specific ids (e.g. gpt-5.3-codex-spark). - runCodexAuthProbe now distinguishes "model not available" from an auth failure and surfaces the real API error: collectEvents retains stream events when the SDK throws on a non-zero exit, and the API error JSON envelope is unwrapped to its human-readable message. - The Codex isolation warning now renders inside the clack setup frame. - Docs updated to gpt-5.5 with a note that -codex ids require API-key auth. fix: require llm.models.default in status and match codex probe remediation Status reported a project ready when a non-none LLM backend was configured without llm.models.default, but the runtime (resolveModelSlots) hard-requires it, so ingest/scan/memory threw after `ktx status` said the project was usable. buildLlmStatus now fails for any non-none backend missing models.default and no longer invents a fallback model for claude-code/codex. 
Codex probe failures now carry a category-matched fix: a model-access failure steers the user at llm.models.default instead of the auth/install remediation. runCodexAuthProbe returns the fix and status consumes it; the message stays self-sufficient so setup output is unchanged. Docs: README now lists the codex backend and local Codex auth; ktx-setup.mdx states --llm-model only accepts codex/default or gpt-/codex- ids. Repaired four doctor fixtures that configured a backend without models.default (the now-correctly-blocked config) and added coverage for the new behavior. 2026-06-02 13:57:11 +02:00			`import { describe, expect, it } from 'vitest';`
			`import {`
			`parseCodexExecEventLine,`
			`summarizeCodexExecEvents,`
			`} from '../../../src/context/llm/codex-exec-events.js';`

			`describe('Codex exec event parsing', () => {`
			`it('uses the completed turn as one step when no MCP tools run', () => {`
			`const summary = summarizeCodexExecEvents(`
			`[`
			`{ type: 'thread.started', thread_id: 'thr_1' },`
			`{ type: 'turn.started' },`
			`{ type: 'item.completed', item: { id: 'item_1', type: 'agent_message', text: 'hello from codex' } },`
			`{`
			`type: 'turn.completed',`
			`usage: {`
			`input_tokens: 12,`
			`cached_input_tokens: 4,`
			`output_tokens: 5,`
			`reasoning_output_tokens: 2,`
			`},`
			`},`
			`],`
			`{ startedAt: 100, now: () => 125 },`
			`);`

			`expect(summary).toEqual({`
			`finalText: 'hello from codex',`
			`stopReason: 'natural',`
			`usage: { inputTokens: 12, outputTokens: 5, totalTokens: 17 },`
			`stepCount: 1,`
			`stepBoundariesMs: [25],`
			`toolCallCount: 0,`
			`toolFailures: [],`
			`});`
			`});`

			`it('uses completed MCP tool calls as loop steps', () => {`
			`const offsets = [115, 140, 175];`
			`const summary = summarizeCodexExecEvents(`
			`[`
			`{ type: 'turn.started' },`
			`{`
			`type: 'item.started',`
			`item: { id: 'call_1', type: 'mcp_tool_call', server: 'ktx', tool: 'search', arguments: {}, status: 'in_progress' },`
			`},`
			`{`
			`type: 'item.completed',`
			`item: { id: 'call_1', type: 'mcp_tool_call', server: 'ktx', tool: 'search', arguments: {}, status: 'completed' },`
			`},`
			`{`
			`type: 'item.started',`
			`item: { id: 'call_2', type: 'mcp_tool_call', server: 'ktx', tool: 'lookup', arguments: {}, status: 'in_progress' },`
			`},`
			`{`
			`type: 'item.completed',`
			`item: {`
			`id: 'call_2',`
			`type: 'mcp_tool_call',`
			`server: 'ktx',`
			`tool: 'lookup',`
			`arguments: {},`
			`status: 'failed',`
			`error: { message: 'denied' },`
			`},`
			`},`
			`{ type: 'item.completed', item: { id: 'item_1', type: 'agent_message', text: 'done' } },`
			`{ type: 'turn.completed', usage: { input_tokens: 1, output_tokens: 1, cached_input_tokens: 0, reasoning_output_tokens: 0 } },`
			`],`
			`{ startedAt: 100, now: () => offsets.shift() ?? 175 },`
			`);`

			`expect(summary).toEqual({`
			`finalText: 'done',`
			`stopReason: 'natural',`
			`usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 },`
			`stepCount: 2,`
			`stepBoundariesMs: [15, 40],`
			`toolCallCount: 2,`
			`toolFailures: ['lookup: denied'],`
			`});`
			`});`

			`it('does not treat a completed MCP tool call as failed when Codex sends error: null', () => {`
			`// Captured verbatim from a real @openai/codex-sdk run: successful tool calls`
			// carry `error: null` and `result` alongside `status: "completed"`.
			`const summary = summarizeCodexExecEvents([`
			`{ type: 'turn.started' },`
			`{`
			`type: 'item.started',`
			`item: {`
			`id: 'item_1',`
			`type: 'mcp_tool_call',`
			`server: 'ktx',`
			`tool: 'echo_value',`
			`arguments: { value: 'ktx_codex_tool_ok' },`
			`result: null,`
			`error: null,`
			`status: 'in_progress',`
			`},`
			`},`
			`{`
			`type: 'item.completed',`
			`item: {`
			`id: 'item_1',`
			`type: 'mcp_tool_call',`
			`server: 'ktx',`
			`tool: 'echo_value',`
			`arguments: { value: 'ktx_codex_tool_ok' },`
			`result: { content: [{ type: 'text', text: 'echo:ktx_codex_tool_ok' }], structured_content: null },`
			`error: null,`
			`status: 'completed',`
			`},`
			`},`
			`{ type: 'item.completed', item: { id: 'm1', type: 'agent_message', text: 'done' } },`
			`{ type: 'turn.completed', usage: { input_tokens: 1, output_tokens: 1 } },`
			`]);`

			`expect(summary.toolFailures).toEqual([]);`
			`expect(summary.toolCallCount).toBe(1);`
			`});`

			`it('counts built-in command executions as loop steps without failing the loop', () => {`
			`const offsets = [110, 130];`
			`const summary = summarizeCodexExecEvents(`
			`[`
			`{ type: 'turn.started' },`
			`{ type: 'item.completed', item: { id: 'c1', type: 'command_execution', command: 'ls', status: 'completed', exit_code: 0 } },`
			`{ type: 'item.completed', item: { id: 'c2', type: 'command_execution', command: 'cat missing', status: 'failed', exit_code: 1 } },`
			`{ type: 'item.completed', item: { id: 'm1', type: 'agent_message', text: 'done' } },`
			`{ type: 'turn.completed', usage: { input_tokens: 2, output_tokens: 1 } },`
			`],`
			`{ startedAt: 100, now: () => offsets.shift() ?? 130 },`
			`);`

			`expect(summary.stepCount).toBe(2);`
			`expect(summary.stepBoundariesMs).toEqual([10, 30]);`
			`// A non-zero command exit is normal agent exploration, not a runtime tool failure.`
			`expect(summary.toolFailures).toEqual([]);`
			`expect(summary.toolCallCount).toBe(0);`
			`});`

			`it('maps turn failures into error stop reason', () => {`
			`const summary = summarizeCodexExecEvents([`
			`{ type: 'turn.started' },`
			`{ type: 'turn.failed', error: { message: 'Codex could not connect to required MCP server' } },`
			`]);`

			`expect(summary.stopReason).toBe('error');`
			`expect(summary.error?.message).toContain('Codex could not connect to required MCP server');`
			`});`

			`it('unwraps the Codex API error envelope into its human-readable message', () => {`
			`// Codex serializes API errors as a JSON envelope inside the event message.`
			`const apiError = JSON.stringify({`
			`type: 'error',`
			`status: 400,`
			`error: {`
			`type: 'invalid_request_error',`
			`message: "The 'gpt-5.3-codex' model is not supported when using Codex with a ChatGPT account.",`
			`},`
			`});`
			`const summary = summarizeCodexExecEvents([`
			`{ type: 'thread.started', thread_id: 'thr_1' },`
			`{ type: 'turn.started' },`
			`{ type: 'error', message: apiError },`
			`{ type: 'turn.failed', error: { message: apiError } },`
			`]);`

			`expect(summary.stopReason).toBe('error');`
			`expect(summary.error?.message).toBe(`
			`"The 'gpt-5.3-codex' model is not supported when using Codex with a ChatGPT account.",`
			`);`
			`});`

			`it('maps max-turns terminal reasons into budget stop reason when Codex emits one', () => {`
			`const summary = summarizeCodexExecEvents([`
			`{ type: 'turn.started' },`
			`{ type: 'turn.completed', reason: 'max_turns', usage: { input_tokens: 1, output_tokens: 1 } },`
			`]);`

			`expect(summary.stopReason).toBe('budget');`
			`});`

			`it('throws a clear error for malformed JSONL lines', () => {`
			`expect(() => parseCodexExecEventLine('{not-json')).toThrow('Codex JSONL event stream was malformed');`
			`});`
			`});`