fix: drive codex loop metrics from mcp events

This commit is contained in:
Andrey Avtomonov 2026-06-01 18:06:37 +02:00
parent 1430ca49eb
commit f27fc9c9a5
4 changed files with 132 additions and 49 deletions

View file

@ -81,7 +81,8 @@ export function summarizeCodexExecEvents(
let finalText = '';
let stopReason: RunLoopStopReason = 'natural';
let usage: LlmTokenUsage = {};
let stepCount = 0;
let turnCount = 0;
let completedToolStepCount = 0;
const stepBoundariesMs: number[] = [];
let toolCallCount = 0;
const toolFailures: string[] = [];
@ -95,13 +96,38 @@ export function summarizeCodexExecEvents(
}
if (eventType === 'turn.started') {
stepCount += 1;
turnCount += 1;
continue;
}
const item = record(eventRecord.item);
const itemType = text(item?.type);
if (eventType === 'item.started' && itemType === 'mcp_tool_call') {
toolCallCount += 1;
continue;
}
if (eventType === 'item.completed' && itemType === 'mcp_tool_call') {
completedToolStepCount += 1;
stepBoundariesMs.push(now() - startedAt);
if (item?.error !== undefined || item?.status === 'failed') {
const name = text(item.name) ?? text(item.tool) ?? text(item.tool_name) ?? 'unknown';
toolFailures.push(`${name}: ${errorMessageFrom(item.error)}`);
}
continue;
}
if (eventType === 'item.completed' && itemType === 'agent_message') {
finalText = text(item?.text) ?? finalText;
continue;
}
if (eventType === 'turn.completed') {
usage = usageFrom(eventRecord.usage);
stepBoundariesMs.push(now() - startedAt);
if (completedToolStepCount === 0) {
stepBoundariesMs.push(now() - startedAt);
}
stopReason = stopReasonFrom(eventRecord.reason ?? eventRecord.stop_reason ?? eventRecord.terminal_reason);
continue;
}
@ -109,28 +135,6 @@ export function summarizeCodexExecEvents(
if (eventType === 'turn.failed' || eventType === 'error') {
stopReason = 'error';
error = new Error(errorMessageFrom(eventRecord.error ?? eventRecord.message));
continue;
}
const item = record(eventRecord.item);
const itemType = text(item?.type);
if (!item || !itemType) {
continue;
}
if (eventType === 'item.completed' && itemType === 'agent_message') {
finalText = text(item.text) ?? finalText;
continue;
}
if (eventType === 'item.started' && itemType === 'mcp_tool_call') {
toolCallCount += 1;
continue;
}
if (eventType === 'item.completed' && itemType === 'mcp_tool_call' && item.error !== undefined) {
const name = text(item.name) ?? text(item.tool) ?? text(item.tool_name) ?? 'unknown';
toolFailures.push(`${name}: ${errorMessageFrom(item.error)}`);
}
}
@ -138,7 +142,7 @@ export function summarizeCodexExecEvents(
finalText,
stopReason,
usage,
stepCount,
stepCount: completedToolStepCount > 0 ? completedToolStepCount : turnCount,
stepBoundariesMs,
toolCallCount,
toolFailures,

View file

@ -46,9 +46,20 @@ function metrics(summary: CodexExecEventSummary, startedAt: number): { totalMs:
return { totalMs: Date.now() - startedAt, usage: summary.usage };
}
function assertSuccessfulText(summary: CodexExecEventSummary): string {
function summaryError(summary: CodexExecEventSummary): Error | undefined {
if (summary.error) {
throw summary.error;
return summary.error;
}
if (summary.toolFailures.length > 0) {
return new Error(`Codex runtime tool call failed: ${summary.toolFailures.join('; ')}`);
}
return undefined;
}
function assertSuccessfulText(summary: CodexExecEventSummary): string {
const error = summaryError(summary);
if (error) {
throw error;
}
if (!summary.finalText.trim()) {
throw new Error('Codex completed without an agent message');
@ -215,9 +226,11 @@ export class CodexKtxLlmRuntime implements KtxLlmRuntimePort {
);
}
}
const error = summaryError(summary);
const stopReason = error ? 'error' : summary.stopReason;
return {
stopReason: summary.stopReason,
...(summary.stopReason === 'error' && summary.error ? { error: summary.error } : {}),
stopReason,
...(stopReason === 'error' && error ? { error } : {}),
metrics: {
totalMs: Date.now() - startedAt,
usage: summary.usage,

View file

@ -5,7 +5,7 @@ import {
} from '../../../src/context/llm/codex-exec-events.js';
describe('Codex exec event parsing', () => {
it('captures final agent text, SDK usage, steps, and natural completion', () => {
it('uses the completed turn as one step when no MCP tools run', () => {
const summary = summarizeCodexExecEvents(
[
{ type: 'thread.started', thread_id: 'thr_1' },
@ -35,6 +35,52 @@ describe('Codex exec event parsing', () => {
});
});
// Checks the summary produced for a single turn that contains two MCP tool
// calls: the expected stepCount and stepBoundariesMs follow the completed
// tool calls, and the failed call is reported in toolFailures.
it('uses completed MCP tool calls as loop steps', () => {
// Clock readings handed out, in call order, by the injected `now` below.
const offsets = [115, 140, 175];
const summary = summarizeCodexExecEvents(
[
{ type: 'turn.started' },
// First tool call: started, then completed successfully.
{
type: 'item.started',
item: { id: 'call_1', type: 'mcp_tool_call', server: 'ktx', tool: 'search', arguments: {}, status: 'in_progress' },
},
{
type: 'item.completed',
item: { id: 'call_1', type: 'mcp_tool_call', server: 'ktx', tool: 'search', arguments: {}, status: 'completed' },
},
// Second tool call: started, then completed with a failure.
{
type: 'item.started',
item: { id: 'call_2', type: 'mcp_tool_call', server: 'ktx', tool: 'lookup', arguments: {}, status: 'in_progress' },
},
{
type: 'item.completed',
item: {
id: 'call_2',
type: 'mcp_tool_call',
server: 'ktx',
tool: 'lookup',
arguments: {},
status: 'failed',
error: { message: 'denied' },
},
},
{ type: 'item.completed', item: { id: 'item_1', type: 'agent_message', text: 'done' } },
{ type: 'turn.completed', usage: { input_tokens: 1, output_tokens: 1, cached_input_tokens: 0, reasoning_output_tokens: 0 } },
],
// Deterministic clock: each `now()` call consumes the next reading and
// falls back to 175 once the list is exhausted.
{ startedAt: 100, now: () => offsets.shift() ?? 175 },
);
expect(summary).toEqual({
finalText: 'done',
// The failed tool call is not expected to change the stop reason here.
stopReason: 'natural',
usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 },
// Both tool calls count as steps, including the failed one.
stepCount: 2,
// Readings 115 and 140 relative to startedAt 100; only two boundaries are
// expected even though a turn.completed event follows the tool calls.
stepBoundariesMs: [15, 40],
toolCallCount: 2,
toolFailures: ['lookup: denied'],
});
});
it('maps turn failures into error stop reason', () => {
const summary = summarizeCodexExecEvents([
{ type: 'turn.started' },
@ -54,24 +100,6 @@ describe('Codex exec event parsing', () => {
expect(summary.stopReason).toBe('budget');
});
// Checks that one started MCP tool call is counted once and that its failed
// completion is reported as "<tool>: <error message>".
it('counts SDK-shaped MCP tool calls and failed MCP tool calls', () => {
const summary = summarizeCodexExecEvents([
{ type: 'turn.started' },
{
type: 'item.started',
item: { id: 'call_1', type: 'mcp_tool_call', server: 'ktx', tool: 'search', arguments: { query: 'revenue' }, status: 'in_progress' },
},
{
type: 'item.completed',
item: { id: 'call_1', type: 'mcp_tool_call', server: 'ktx', tool: 'search', arguments: { query: 'revenue' }, status: 'failed', error: { message: 'denied' } },
},
{ type: 'turn.completed' },
]);
expect(summary.toolCallCount).toBe(1);
expect(summary.toolFailures).toEqual(['search: denied']);
});
// Checks that parsing a line that is not valid JSON is expected to throw with
// the malformed-stream message.
it('throws a clear error for malformed JSONL lines', () => {
expect(() => parseCodexExecEventLine('{not-json')).toThrow('Codex JSONL event stream was malformed');
});

View file

@ -169,6 +169,44 @@ describe('CodexKtxLlmRuntime', () => {
expect(result.error?.message).toBe('boom');
});
// Checks that runAgentLoop reports an error result when an MCP tool call
// fails, even though the event stream ends with turn.completed rather than
// turn.failed.
it('surfaces failed MCP tool calls as agent-loop errors', async () => {
const runtime = new CodexKtxLlmRuntime({
projectDir: '/tmp/project',
modelSlots: { default: 'codex' },
// Stubbed runner that supplies the event sequence below.
runner: runner([
{ type: 'turn.started' },
{ type: 'item.started', item: { type: 'mcp_tool_call', server: 'ktx', tool: 'search', status: 'in_progress' } },
// The only tool call completes with status 'failed' and an error payload.
{
type: 'item.completed',
item: {
type: 'mcp_tool_call',
server: 'ktx',
tool: 'search',
status: 'failed',
error: { message: 'denied' },
},
},
{ type: 'turn.completed', usage: { input_tokens: 1, output_tokens: 1 } },
]),
});
const result = await runtime.runAgentLoop({
modelRole: 'default',
systemPrompt: 'system',
userPrompt: 'user',
stepBudget: 5,
telemetryTags: {},
toolSet: {},
});
// The tool failure is expected to become the loop's stop reason and error.
expect(result.stopReason).toBe('error');
expect(result.error?.message).toBe('Codex runtime tool call failed: search: denied');
// The failed call is still expected to count as one step, and usage from
// turn.completed is still expected to be reported.
expect(result.metrics).toMatchObject({
stepCount: 1,
usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 },
});
});
it('probes Codex authentication through a minimal non-interactive turn', async () => {
const fakeRunner = runner([
{ type: 'turn.started' },