fix: drive codex loop metrics from mcp events

2026-06-16 08:25:14 +02:00 · 2026-06-01 18:06:37 +02:00 · 2026-06-01 18:06:37 +02:00 · f27fc9c9a5
commit f27fc9c9a5
parent 1430ca49eb
4 changed files with 132 additions and 49 deletions
--- a/packages/cli/src/context/llm/codex-exec-events.ts
+++ b/packages/cli/src/context/llm/codex-exec-events.ts
@@ -81,7 +81,8 @@ export function summarizeCodexExecEvents(
  let finalText = '';
  let stopReason: RunLoopStopReason = 'natural';
  let usage: LlmTokenUsage = {};
-  let stepCount = 0;
+  let turnCount = 0;
+  let completedToolStepCount = 0;
  const stepBoundariesMs: number[] = [];
  let toolCallCount = 0;
  const toolFailures: string[] = [];
@@ -95,13 +96,38 @@ export function summarizeCodexExecEvents(
    }

    if (eventType === 'turn.started') {
-      stepCount += 1;
+      turnCount += 1;
+      continue;
+    }
+
+    const item = record(eventRecord.item);
+    const itemType = text(item?.type);
+
+    if (eventType === 'item.started' && itemType === 'mcp_tool_call') {
+      toolCallCount += 1;
+      continue;
+    }
+
+    if (eventType === 'item.completed' && itemType === 'mcp_tool_call') {
+      completedToolStepCount += 1;
+      stepBoundariesMs.push(now() - startedAt);
+      if (item?.error !== undefined || item?.status === 'failed') {
+        const name = text(item.name) ?? text(item.tool) ?? text(item.tool_name) ?? 'unknown';
+        toolFailures.push(`${name}: ${errorMessageFrom(item.error)}`);
+      }
+      continue;
+    }
+
+    if (eventType === 'item.completed' && itemType === 'agent_message') {
+      finalText = text(item?.text) ?? finalText;
      continue;
    }

    if (eventType === 'turn.completed') {
      usage = usageFrom(eventRecord.usage);
-      stepBoundariesMs.push(now() - startedAt);
+      if (completedToolStepCount === 0) {
+        stepBoundariesMs.push(now() - startedAt);
+      }
      stopReason = stopReasonFrom(eventRecord.reason ?? eventRecord.stop_reason ?? eventRecord.terminal_reason);
      continue;
    }
@@ -109,28 +135,6 @@ export function summarizeCodexExecEvents(
    if (eventType === 'turn.failed' || eventType === 'error') {
      stopReason = 'error';
      error = new Error(errorMessageFrom(eventRecord.error ?? eventRecord.message));
-      continue;
-    }
-
-    const item = record(eventRecord.item);
-    const itemType = text(item?.type);
-    if (!item || !itemType) {
-      continue;
-    }
-
-    if (eventType === 'item.completed' && itemType === 'agent_message') {
-      finalText = text(item.text) ?? finalText;
-      continue;
-    }
-
-    if (eventType === 'item.started' && itemType === 'mcp_tool_call') {
-      toolCallCount += 1;
-      continue;
-    }
-
-    if (eventType === 'item.completed' && itemType === 'mcp_tool_call' && item.error !== undefined) {
-      const name = text(item.name) ?? text(item.tool) ?? text(item.tool_name) ?? 'unknown';
-      toolFailures.push(`${name}: ${errorMessageFrom(item.error)}`);
    }
  }

@@ -138,7 +142,7 @@ export function summarizeCodexExecEvents(
    finalText,
    stopReason,
    usage,
-    stepCount,
+    stepCount: completedToolStepCount > 0 ? completedToolStepCount : turnCount,
    stepBoundariesMs,
    toolCallCount,
    toolFailures,
--- a/packages/cli/src/context/llm/codex-runtime.ts
+++ b/packages/cli/src/context/llm/codex-runtime.ts
@@ -46,9 +46,20 @@ function metrics(summary: CodexExecEventSummary, startedAt: number): { totalMs:
  return { totalMs: Date.now() - startedAt, usage: summary.usage };
 }

-function assertSuccessfulText(summary: CodexExecEventSummary): string {
+function summaryError(summary: CodexExecEventSummary): Error | undefined {
  if (summary.error) {
-    throw summary.error;
+    return summary.error;
+  }
+  if (summary.toolFailures.length > 0) {
+    return new Error(`Codex runtime tool call failed: ${summary.toolFailures.join('; ')}`);
+  }
+  return undefined;
+}
+
+function assertSuccessfulText(summary: CodexExecEventSummary): string {
+  const error = summaryError(summary);
+  if (error) {
+    throw error;
  }
  if (!summary.finalText.trim()) {
    throw new Error('Codex completed without an agent message');
@@ -215,9 +226,11 @@ export class CodexKtxLlmRuntime implements KtxLlmRuntimePort {
          );
        }
      }
+      const error = summaryError(summary);
+      const stopReason = error ? 'error' : summary.stopReason;
      return {
-        stopReason: summary.stopReason,
-        ...(summary.stopReason === 'error' && summary.error ? { error: summary.error } : {}),
+        stopReason,
+        ...(stopReason === 'error' && error ? { error } : {}),
        metrics: {
          totalMs: Date.now() - startedAt,
          usage: summary.usage,
--- a/packages/cli/test/context/llm/codex-exec-events.test.ts
+++ b/packages/cli/test/context/llm/codex-exec-events.test.ts
@@ -5,7 +5,7 @@ import {
 } from '../../../src/context/llm/codex-exec-events.js';

 describe('Codex exec event parsing', () => {
-  it('captures final agent text, SDK usage, steps, and natural completion', () => {
+  it('uses the completed turn as one step when no MCP tools run', () => {
    const summary = summarizeCodexExecEvents(
      [
        { type: 'thread.started', thread_id: 'thr_1' },
@@ -35,6 +35,52 @@ describe('Codex exec event parsing', () => {
    });
  });

+  it('uses completed MCP tool calls as loop steps', () => {
+    const offsets = [115, 140, 175];
+    const summary = summarizeCodexExecEvents(
+      [
+        { type: 'turn.started' },
+        {
+          type: 'item.started',
+          item: { id: 'call_1', type: 'mcp_tool_call', server: 'ktx', tool: 'search', arguments: {}, status: 'in_progress' },
+        },
+        {
+          type: 'item.completed',
+          item: { id: 'call_1', type: 'mcp_tool_call', server: 'ktx', tool: 'search', arguments: {}, status: 'completed' },
+        },
+        {
+          type: 'item.started',
+          item: { id: 'call_2', type: 'mcp_tool_call', server: 'ktx', tool: 'lookup', arguments: {}, status: 'in_progress' },
+        },
+        {
+          type: 'item.completed',
+          item: {
+            id: 'call_2',
+            type: 'mcp_tool_call',
+            server: 'ktx',
+            tool: 'lookup',
+            arguments: {},
+            status: 'failed',
+            error: { message: 'denied' },
+          },
+        },
+        { type: 'item.completed', item: { id: 'item_1', type: 'agent_message', text: 'done' } },
+        { type: 'turn.completed', usage: { input_tokens: 1, output_tokens: 1, cached_input_tokens: 0, reasoning_output_tokens: 0 } },
+      ],
+      { startedAt: 100, now: () => offsets.shift() ?? 175 },
+    );
+
+    expect(summary).toEqual({
+      finalText: 'done',
+      stopReason: 'natural',
+      usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 },
+      stepCount: 2,
+      stepBoundariesMs: [15, 40],
+      toolCallCount: 2,
+      toolFailures: ['lookup: denied'],
+    });
+  });
+
  it('maps turn failures into error stop reason', () => {
    const summary = summarizeCodexExecEvents([
      { type: 'turn.started' },
@@ -54,24 +100,6 @@ describe('Codex exec event parsing', () => {
    expect(summary.stopReason).toBe('budget');
  });

-  it('counts SDK-shaped MCP tool calls and failed MCP tool calls', () => {
-    const summary = summarizeCodexExecEvents([
-      { type: 'turn.started' },
-      {
-        type: 'item.started',
-        item: { id: 'call_1', type: 'mcp_tool_call', server: 'ktx', tool: 'search', arguments: { query: 'revenue' }, status: 'in_progress' },
-      },
-      {
-        type: 'item.completed',
-        item: { id: 'call_1', type: 'mcp_tool_call', server: 'ktx', tool: 'search', arguments: { query: 'revenue' }, status: 'failed', error: { message: 'denied' } },
-      },
-      { type: 'turn.completed' },
-    ]);
-
-    expect(summary.toolCallCount).toBe(1);
-    expect(summary.toolFailures).toEqual(['search: denied']);
-  });
-
  it('throws a clear error for malformed JSONL lines', () => {
    expect(() => parseCodexExecEventLine('{not-json')).toThrow('Codex JSONL event stream was malformed');
  });
--- a/packages/cli/test/context/llm/codex-runtime.test.ts
+++ b/packages/cli/test/context/llm/codex-runtime.test.ts
@@ -169,6 +169,44 @@ describe('CodexKtxLlmRuntime', () => {
    expect(result.error?.message).toBe('boom');
  });

+  it('surfaces failed MCP tool calls as agent-loop errors', async () => {
+    const runtime = new CodexKtxLlmRuntime({
+      projectDir: '/tmp/project',
+      modelSlots: { default: 'codex' },
+      runner: runner([
+        { type: 'turn.started' },
+        { type: 'item.started', item: { type: 'mcp_tool_call', server: 'ktx', tool: 'search', status: 'in_progress' } },
+        {
+          type: 'item.completed',
+          item: {
+            type: 'mcp_tool_call',
+            server: 'ktx',
+            tool: 'search',
+            status: 'failed',
+            error: { message: 'denied' },
+          },
+        },
+        { type: 'turn.completed', usage: { input_tokens: 1, output_tokens: 1 } },
+      ]),
+    });
+
+    const result = await runtime.runAgentLoop({
+      modelRole: 'default',
+      systemPrompt: 'system',
+      userPrompt: 'user',
+      stepBudget: 5,
+      telemetryTags: {},
+      toolSet: {},
+    });
+
+    expect(result.stopReason).toBe('error');
+    expect(result.error?.message).toBe('Codex runtime tool call failed: search: denied');
+    expect(result.metrics).toMatchObject({
+      stepCount: 1,
+      usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 },
+    });
+  });
+
  it('probes Codex authentication through a minimal non-interactive turn', async () => {
    const fakeRunner = runner([
      { type: 'turn.started' },